Upload 7 files

Browse files

Files changed (8) hide show

.gitattributes +4 -0
Screenshot 2025-11-07 at 1.03.27 PM.png +3 -0
duel_dqn.py +178 -0
flux_krea_00776_.png +3 -0
output.mp4 +3 -0
pyqt5_duel_dqn_super_mario_bros_tutorial/app.py +2294 -0
pyqt5_duel_dqn_super_mario_bros_tutorial/flux_krea_00776_.png +3 -0
pyqt5_duel_dqn_super_mario_bros_tutorial/installed_packages_dqn.txt +4 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+flux_krea_00776_.png filter=lfs diff=lfs merge=lfs -text
+output.mp4 filter=lfs diff=lfs merge=lfs -text
+pyqt5_duel_dqn_super_mario_bros_tutorial/flux_krea_00776_.png filter=lfs diff=lfs merge=lfs -text
+Screenshot[[:space:]]2025-11-07[[:space:]]at[[:space:]]1.03.27 PM.png filter=lfs diff=lfs merge=lfs -text

Screenshot 2025-11-07 at 1.03.27 PM.png ADDED Viewed

Git LFS Details

SHA256: bdea24be2cb2a231d03b1d78b970728f7d4e4a891dae76d9d715e00bf5fc0a5f
Pointer size: 131 Bytes
Size of remote file: 755 kB

duel_dqn.py ADDED Viewed

	@@ -0,0 +1,178 @@

+import pickle
+import random
+import time
+from collections import deque
+import gym_super_mario_bros
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
+from nes_py.wrappers import JoypadSpace
+from wrappers import *
+def arrange(s):
+    if not type(s) == "numpy.ndarray":
+        s = np.array(s)
+    assert len(s.shape) == 3
+    ret = np.transpose(s, (2, 0, 1))
+    return np.expand_dims(ret, 0)
+class replay_memory(object):
+    def __init__(self, N):
+        self.memory = deque(maxlen=N)
+    def push(self, transition):
+        self.memory.append(transition)
+    def sample(self, n):
+        return random.sample(self.memory, n)
+    def __len__(self):
+        return len(self.memory)
+class model(nn.Module):
+    def __init__(self, n_frame, n_action, device):
+        super(model, self).__init__()
+        self.layer1 = nn.Conv2d(n_frame, 32, 8, 4)
+        self.layer2 = nn.Conv2d(32, 64, 3, 1)
+        self.fc = nn.Linear(20736, 512)
+        self.q = nn.Linear(512, n_action)
+        self.v = nn.Linear(512, 1)
+        self.device = device
+        self.seq = nn.Sequential(self.layer1, self.layer2, self.fc, self.q, self.v)
+        self.seq.apply(init_weights)
+    def forward(self, x):
+        if type(x) != torch.Tensor:
+            x = torch.FloatTensor(x).to(self.device)
+        x = torch.relu(self.layer1(x))
+        x = torch.relu(self.layer2(x))
+        x = x.view(-1, 20736)
+        x = torch.relu(self.fc(x))
+        adv = self.q(x)
+        v = self.v(x)
+        q = v + (adv - 1 / adv.shape[-1] * adv.sum(-1, keepdim=True))
+        return q
+def init_weights(m):
+    if type(m) == nn.Conv2d:
+        torch.nn.init.xavier_uniform_(m.weight)
+        m.bias.data.fill_(0.01)
+def train(q, q_target, memory, batch_size, gamma, optimizer, device):
+    s, r, a, s_prime, done = list(map(list, zip(*memory.sample(batch_size))))
+    s = np.array(s).squeeze()
+    s_prime = np.array(s_prime).squeeze()
+    a_max = q(s_prime).max(1)[1].unsqueeze(-1)
+    r = torch.FloatTensor(r).unsqueeze(-1).to(device)
+    done = torch.FloatTensor(done).unsqueeze(-1).to(device)
+    with torch.no_grad():
+        y = r + gamma * q_target(s_prime).gather(1, a_max) * done
+    a = torch.tensor(a).unsqueeze(-1).to(device)
+    q_value = torch.gather(q(s), dim=1, index=a.view(-1, 1).long())
+    loss = F.smooth_l1_loss(q_value, y).mean()
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss
+def copy_weights(q, q_target):
+    q_dict = q.state_dict()
+    q_target.load_state_dict(q_dict)
+def main(env, q, q_target, optimizer, device):
+    t = 0
+    gamma = 0.99
+    batch_size = 256
+    N = 50000
+    eps = 0.001
+    memory = replay_memory(N)
+    update_interval = 50
+    print_interval = 10
+    score_lst = []
+    total_score = 0.0
+    loss = 0.0
+    start_time = time.perf_counter()
+    for k in range(1000000):
+        s = arrange(env.reset())
+        done = False
+        while not done:
+            if eps > np.random.rand():
+                a = env.action_space.sample()
+            else:
+                if device == "cpu":
+                    a = np.argmax(q(s).detach().numpy())
+                else:
+                    a = np.argmax(q(s).cpu().detach().numpy())
+            s_prime, r, done, _ = env.step(a)
+            s_prime = arrange(s_prime)
+            total_score += r
+            r = np.sign(r) * (np.sqrt(abs(r) + 1) - 1) + 0.001 * r
+            memory.push((s, float(r), int(a), s_prime, int(1 - done)))
+            s = s_prime
+            stage = env.unwrapped._stage
+            if len(memory) > 2000:
+                loss += train(q, q_target, memory, batch_size, gamma, optimizer, device)
+                t += 1
+            if t % update_interval == 0:
+                copy_weights(q, q_target)
+                torch.save(q.state_dict(), "mario_q.pth")
+                torch.save(q_target.state_dict(), "mario_q_target.pth")
+        if k % print_interval == 0:
+            time_spent, start_time = (
+                time.perf_counter() - start_time,
+                time.perf_counter(),
+            )
+            print(
+                "%s |Epoch : %d | score : %f | loss : %.2f | stage : %d | time spent: %f"
+                % (
+                    device,
+                    k,
+                    total_score / print_interval,
+                    loss / print_interval,
+                    stage,
+                    time_spent,
+                )
+            )
+            score_lst.append(total_score / print_interval)
+            total_score = 0
+            loss = 0.0
+            pickle.dump(score_lst, open("score.p", "wb"))
+if __name__ == "__main__":
+    n_frame = 4
+    env = gym_super_mario_bros.make("SuperMarioBros-v0")
+    env = JoypadSpace(env, COMPLEX_MOVEMENT)
+    env = wrap_mario(env)
+    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+    q = model(n_frame, env.action_space.n, device).to(device)
+    q_target = model(n_frame, env.action_space.n, device).to(device)
+    optimizer = optim.Adam(q.parameters(), lr=0.0001)
+    print(device)
+    main(env, q, q_target, optimizer, device)

flux_krea_00776_.png ADDED Viewed

Git LFS Details

SHA256: 9b38502348cc00c0ea8949e1165b55bee8ec9c8ec35fc8d9143fa571b5a49e98
Pointer size: 132 Bytes
Size of remote file: 1.37 MB

output.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0f705a6effd35fbb11e8e5a337669c42ad244c62ba84a084f95f813a6fb981a
+size 57430066

pyqt5_duel_dqn_super_mario_bros_tutorial/app.py ADDED Viewed

	@@ -0,0 +1,2294 @@

+import sys
+import os
+from PyQt5.QtWidgets import (QApplication, QMainWindow, QWidget, QVBoxLayout,
+                            QHBoxLayout, QTextEdit, QPushButton, QTabWidget,
+                            QLabel, QScrollArea, QFrame, QSplitter, QListWidget,
+                            QListWidgetItem, QGroupBox, QProgressBar, QSpinBox,
+                            QDoubleSpinBox, QCheckBox, QComboBox, QMessageBox)
+from PyQt5.QtCore import Qt, QTimer, pyqtSignal
+from PyQt5.QtGui import QFont, QPalette, QColor, QPixmap
+import markdown
+class CodeExerciseWidget(QWidget):
+    exerciseCompleted = pyqtSignal(str, bool)  # section_name, completed
+    def __init__(self, title, description, code_template, solution_code, section_name):
+        super().__init__()
+        self.title = title
+        self.description = description
+        self.code_template = code_template
+        self.solution_code = solution_code
+        self.section_name = section_name
+        self.is_completed = False
+        self.init_ui()
+    def init_ui(self):
+        layout = QVBoxLayout()
+        # Title
+        title_label = QLabel(f"<h2>{self.title}</h2>")
+        layout.addWidget(title_label)
+        # Description
+        desc_label = QLabel(self.description)
+        desc_label.setWordWrap(True)
+        layout.addWidget(desc_label)
+        # Code editor area
+        code_group = QGroupBox("Your Implementation")
+        code_layout = QVBoxLayout()
+        self.code_editor = QTextEdit()
+        self.code_editor.setPlainText(self.code_template)
+        self.code_editor.setFont(QFont("Courier", 10))
+        code_layout.addWidget(self.code_editor)
+        # Buttons
+        button_layout = QHBoxLayout()
+        self.run_btn = QPushButton("Run Code")
+        self.run_btn.clicked.connect(self.run_code)
+        self.solution_btn = QPushButton("Show Solution")
+        self.solution_btn.clicked.connect(self.show_solution)
+        self.reset_btn = QPushButton("Reset")
+        self.reset_btn.clicked.connect(self.reset_code)
+        self.complete_btn = QPushButton("Mark as Completed")
+        self.complete_btn.clicked.connect(self.mark_completed)
+        self.complete_btn.setStyleSheet("background-color: #28a745; color: white;")
+        button_layout.addWidget(self.run_btn)
+        button_layout.addWidget(self.solution_btn)
+        button_layout.addWidget(self.reset_btn)
+        button_layout.addWidget(self.complete_btn)
+        code_layout.addLayout(button_layout)
+        code_group.setLayout(code_layout)
+        layout.addWidget(code_group)
+        # Output area
+        output_group = QGroupBox("Output")
+        output_layout = QVBoxLayout()
+        self.output_text = QTextEdit()
+        self.output_text.setReadOnly(True)
+        self.output_text.setFont(QFont("Courier", 9))
+        output_layout.addWidget(self.output_text)
+        output_group.setLayout(output_layout)
+        layout.addWidget(output_group)
+        self.setLayout(layout)
+    def run_code(self):
+        # In a real implementation, this would execute the code safely
+        # For this example, we'll just show a message
+        self.output_text.append("Running your code...")
+        self.output_text.append("(In a full implementation, this would execute the Python code)")
+    def show_solution(self):
+        self.code_editor.setPlainText(self.solution_code)
+    def reset_code(self):
+        self.code_editor.setPlainText(self.code_template)
+        self.output_text.clear()
+    def mark_completed(self):
+        self.is_completed = True
+        self.complete_btn.setEnabled(False)
+        self.complete_btn.setText("✓ Completed")
+        self.complete_btn.setStyleSheet("background-color: #6c757d; color: white;")
+        self.output_text.append("\n✓ Exercise marked as completed!")
+        self.exerciseCompleted.emit(self.section_name, True)
+class SectionCompletionWidget(QWidget):
+    def __init__(self, section_name):
+        super().__init__()
+        self.section_name = section_name
+        self.is_completed = False
+        self.init_ui()
+    def init_ui(self):
+        layout = QHBoxLayout()
+        self.checkbox = QCheckBox(f"Complete {self.section_name}")
+        self.checkbox.stateChanged.connect(self.on_checkbox_changed)
+        layout.addWidget(self.checkbox)
+        layout.addStretch()
+        self.setLayout(layout)
+    def on_checkbox_changed(self, state):
+        self.is_completed = (state == Qt.Checked)
+class DuelingDQNTutorialApp(QMainWindow):
+    def __init__(self):
+        super().__init__()
+        self.setWindowTitle("Dueling DQN for Super Mario Bros - Interactive Tutorial")
+        self.setGeometry(100, 100, 1200, 800)
+        # Track completion state
+        self.section_completion = {
+            "Introduction": False,
+            "Dueling DQN Theory": False,
+            "Environment Setup": False,
+            "Replay Memory": False,
+            "Neural Network": False,
+            "Training Algorithm": False,
+            "Complete Implementation": False,
+            "Exercises": False
+        }
+        self.exercise_completion = {
+            "Replay Memory": False,
+            "Dueling DQN": False,
+            "Environment Wrapper": False,
+            "Training Loop": False,
+            "Reward Shaping": False,
+            "Model Saving": False,
+            "Hyperparameter Tuning": False,
+            "Evaluation": False
+        }
+        self.init_ui()
+    def init_ui(self):
+        # Central widget and main layout
+        central_widget = QWidget()
+        self.setCentralWidget(central_widget)
+        main_layout = QHBoxLayout(central_widget)
+        # Left sidebar with navigation
+        left_sidebar = self.create_sidebar()
+        main_layout.addWidget(left_sidebar)
+        # Right content area
+        right_content = self.create_content_area()
+        main_layout.addWidget(right_content)
+        # Apply styling
+        self.apply_styling()
+    def create_sidebar(self):
+        sidebar = QWidget()
+        sidebar.setFixedWidth(300)
+        sidebar_layout = QVBoxLayout(sidebar)
+        # Title
+        title_label = QLabel("Dueling DQN Tutorial")
+        title_label.setFont(QFont("Arial", 14, QFont.Bold))
+        title_label.setAlignment(Qt.AlignCenter)
+        sidebar_layout.addWidget(title_label)
+        # Navigation list
+        self.nav_list = QListWidget()
+        self.nav_list.addItems([
+            "Introduction",
+            "Dueling DQN Theory",
+            "Environment Setup",
+            "Replay Memory",
+            "Neural Network",
+            "Training Algorithm",
+            "Complete Implementation",
+            "Exercises"
+        ])
+        self.nav_list.currentRowChanged.connect(self.change_content)
+        sidebar_layout.addWidget(self.nav_list)
+        # Section completion tracking
+        completion_group = QGroupBox("Section Completion")
+        completion_layout = QVBoxLayout()
+        self.section_widgets = {}
+        for section in self.section_completion.keys():
+            widget = SectionCompletionWidget(section)
+            widget.checkbox.stateChanged.connect(self.update_progress)
+            self.section_widgets[section] = widget
+            completion_layout.addWidget(widget)
+        completion_group.setLayout(completion_layout)
+        sidebar_layout.addWidget(completion_group)
+        # Progress section
+        progress_group = QGroupBox("Your Progress")
+        progress_layout = QVBoxLayout()
+        self.progress_bar = QProgressBar()
+        self.progress_bar.setValue(0)
+        progress_layout.addWidget(QLabel("Overall Progress:"))
+        progress_layout.addWidget(self.progress_bar)
+        self.exercise_count = QLabel("Exercises: 0/8 completed")
+        progress_layout.addWidget(self.exercise_count)
+        self.section_count = QLabel("Sections: 0/8 completed")
+        progress_layout.addWidget(self.section_count)
+        progress_group.setLayout(progress_layout)
+        sidebar_layout.addWidget(progress_group)
+        sidebar_layout.addStretch()
+        return sidebar
+    def create_content_area(self):
+        # Create tab widget for different content sections
+        self.content_tabs = QTabWidget()
+        # Introduction tab
+        intro_tab = self.create_intro_tab()
+        self.content_tabs.addTab(intro_tab, "Introduction")
+        # Theory tab
+        theory_tab = self.create_theory_tab()
+        self.content_tabs.addTab(theory_tab, "Dueling DQN Theory")
+        # Environment tab
+        env_tab = self.create_environment_tab()
+        self.content_tabs.addTab(env_tab, "Environment Setup")
+        # Replay Memory tab
+        memory_tab = self.create_memory_tab()
+        self.content_tabs.addTab(memory_tab, "Replay Memory")
+        # Neural Network tab
+        nn_tab = self.create_nn_tab()
+        self.content_tabs.addTab(nn_tab, "Neural Network")
+        # Training tab
+        training_tab = self.create_training_tab()
+        self.content_tabs.addTab(training_tab, "Training Algorithm")
+        # Implementation tab
+        impl_tab = self.create_implementation_tab()
+        self.content_tabs.addTab(impl_tab, "Complete Implementation")
+        # Exercises tab
+        exercises_tab = self.create_exercises_tab()
+        self.content_tabs.addTab(exercises_tab, "Exercises")
+        return self.content_tabs
+    def create_intro_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        # Title
+        title_label = QLabel("<h1 style='text-align: center; color: #2c3e50;'>Dueling DQN for Super Mario Bros</h1>")
+        title_label.setAlignment(Qt.AlignCenter)
+        layout.addWidget(title_label)
+        # Image section with better styling
+        image_frame = QFrame()
+        image_frame.setFrameStyle(QFrame.Box)
+        image_frame.setLineWidth(1)
+        image_frame.setMidLineWidth(0)
+        image_frame.setStyleSheet("QFrame { border: 1px solid #ddd; border-radius: 8px; background-color: #fafafa; }")
+        image_layout = QVBoxLayout(image_frame)
+        # Try multiple possible image locations
+        image_loaded = False
+        possible_paths = [
+            "flux_krea_00776_.png",
+            "./flux_krea_00776_.png",
+            "images/flux_krea_00776_.png",
+            "../flux_krea_00776_.png"
+        ]
+        pixmap = None
+        for path in possible_paths:
+            if os.path.exists(path):
+                pixmap = QPixmap(path)
+                if not pixmap.isNull():
+                    image_loaded = True
+                    break
+        if image_loaded and pixmap:
+            image_label = QLabel()
+            # Scale image to reasonable size while maintaining aspect ratio
+            scaled_pixmap = pixmap.scaled(550, 350, Qt.KeepAspectRatio, Qt.SmoothTransformation)
+            image_label.setPixmap(scaled_pixmap)
+            image_label.setAlignment(Qt.AlignCenter)
+            image_label.setStyleSheet("padding: 10px;")
+            image_layout.addWidget(image_label)
+            # Image caption
+            caption = QLabel("<small><em>Dueling DQN Architecture Visualization generated by flux.1-krea.dev</em></small>")
+            caption.setAlignment(Qt.AlignCenter)
+            caption.setStyleSheet("color: #666; font-style: italic; padding: 5px;")
+            image_layout.addWidget(caption)
+        else:
+            # Fallback placeholder
+            placeholder = QLabel("🎨 Architecture Visualization Image\n<small><em>flux_krea_00776_.png</em></small>")
+            placeholder.setAlignment(Qt.AlignCenter)
+            placeholder.setStyleSheet("color: #999; font-style: italic; padding: 40px; background-color: #f0f0f0; border-radius: 4px;")
+            image_layout.addWidget(placeholder)
+        layout.addWidget(image_frame)
+        # Welcome section
+        welcome_text = """
+        <h2 style='color: #34495e;'>Welcome to the Interactive Tutorial!</h2>
+        <p style='line-height: 1.6;'>This application will guide you through implementing a <strong>Dueling Deep Q-Network (DQN)</strong>
+        to play Super Mario Bros. You'll learn both the theory behind Dueling DQN and get
+        hands-on experience building each component through interactive exercises.</p>
+        <div style='background-color: #e8f4fd; padding: 15px; border-radius: 5px; border-left: 4px solid #3498db;'>
+        <strong>🚀 Interactive Learning:</strong> This tutorial combines theoretical explanations with
+        hands-on coding exercises. Complete each section and exercise to track your progress!
+        </div>
+        <h3 style='color: #2c3e50; margin-top: 20px;'>What You'll Learn:</h3>
+        <ul style='line-height: 1.6;'>
+            <li><strong>Mathematical Foundation:</strong> Understand the dueling architecture that separates value and advantage streams</li>
+            <li><strong>Environment Setup:</strong> Configure the Super Mario Bros environment with proper preprocessing</li>
+            <li><strong>Experience Replay:</strong> Implement memory buffer for stable training</li>
+            <li><strong>Neural Network:</strong> Build the dueling DQN with PyTorch</li>
+            <li><strong>Training Algorithm:</strong> Master the DQN training loop with target networks</li>
+            <li><strong>Complete Implementation:</strong> Combine all components into a working AI agent</li>
+        </ul>
+        <h3 style='color: #2c3e50;'>Prerequisites:</h3>
+        <ul style='line-height: 1.6;'>
+            <li>Basic Python programming knowledge</li>
+            <li>Familiarity with PyTorch (helpful but not required)</li>
+            <li>Understanding of basic reinforcement learning concepts</li>
+            <li>Curiosity to build an AI that can play Super Mario Bros! 🎮</li>
+        </ul>
+        <div style='background-color: #fff3cd; padding: 15px; border-radius: 5px; border-left: 4px solid #ffc107; margin-top: 20px;'>
+        <strong>💡 Pro Tip:</strong> Use the navigation panel on the left to move through sections.
+        Mark sections as completed and work through exercises to build your understanding step by step.
+        Each section builds upon the previous one!
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(welcome_text)
+        text_edit.setReadOnly(True)
+        text_edit.setStyleSheet("""
+            QTextEdit {
+                background-color: white;
+                border: 1px solid #ddd;
+                border-radius: 5px;
+                padding: 10px;
+                line-height: 1.6;
+            }
+        """)
+        layout.addWidget(text_edit)
+        return widget
+    def create_theory_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Dueling DQN Theory</h1>
+        <h2>What is Dueling DQN?</h2>
+        <p>Dueling DQN is an improvement over the standard Deep Q-Network that separates
+        the estimation of state value and action advantages. This architecture leads to
+        more stable and efficient learning.</p>
+        <h2>Mathematical Foundation</h2>
+        <p>In standard DQN, we estimate Q-values directly:</p>
+        <p style="text-align: center;"><code>Q(s, a) = f(s, a)</code></p>
+        <p>In Dueling DQN, we decompose the Q-value into two streams:</p>
+        <p style="text-align: center;"><code>Q(s, a) = V(s) + A(s, a)</code></p>
+        <p>Where:</p>
+        <ul>
+            <li><strong>V(s)</strong>: Value function - how good it is to be in state s</li>
+            <li><strong>A(s, a)</strong>: Advantage function - how much better action a is compared to other actions in state s</li>
+        </ul>
+        <p>To ensure identifiability, we use the following aggregation:</p>
+        <p style="text-align: center;"><code>Q(s, a) = V(s) + (A(s, a) - mean(A(s, ·)))</code></p>
+        <h2>Benefits of Dueling Architecture</h2>
+        <h3>Compared to Standard DQN:</h3>
+        <ul>
+            <li>Better generalization across actions</li>
+            <li>More stable learning</li>
+            <li>Faster convergence in many environments</li>
+            <li>Ability to learn which states are valuable without having to learn the effect of each action</li>
+        </ul>
+        <h3>Compared to PPO (Proximal Policy Optimization):</h3>
+        <table border="1" style="width:100%">
+            <tr>
+                <th>Dueling DQN</th>
+                <th>PPO</th>
+            </tr>
+            <tr>
+                <td>Value-based method</td>
+                <td>Policy-based method</td>
+            </tr>
+            <tr>
+                <td>Off-policy learning</td>
+                <td>On-policy learning</td>
+            </tr>
+            <tr>
+                <td>Discrete action spaces</td>
+                <td>Continuous or discrete action spaces</td>
+            </tr>
+            <tr>
+                <td>Generally more sample-efficient</td>
+                <td>Generally more stable</td>
+            </tr>
+            <tr>
+                <td>Easier to implement and debug</td>
+                <td>More hyperparameters to tune</td>
+            </tr>
+        </table>
+        <p>For Super Mario Bros, which has discrete actions, Dueling DQN strikes a good
+        balance between sample efficiency and implementation complexity.</p>
+        <h2>Architecture Diagram</h2>
+        <p style="text-align: center;">
+        [Convolutional Layers] → [Feature Vector] → [Value Stream] + [Advantage Stream] → [Q-values]
+        </p>
+        <p>The convolutional layers process the game frames, then the network splits into
+        two streams that estimate V(s) and A(s, a) separately, which are then combined
+        to produce the final Q-values.</p>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Key Insight:</strong> The dueling architecture allows the network to learn
+        which states are inherently valuable without having to learn the value of each action
+        in those states. This is particularly useful in games like Mario where many actions
+        lead to similar outcomes in safe states.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_environment_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Environment Setup</h1>
+        <h2>Setting up Super Mario Bros</h2>
+        <p>We use the <code>gym-super-mario-bros</code> package to create our environment.
+        This provides a standardized interface to interact with the game.</p>
+        <h3>Key Components:</h3>
+        <pre><code>
+    import gym_super_mario_bros
+    from nes_py.wrappers import JoypadSpace
+    from gym_super_mario_bros.actions import COMPLEX_MOVEMENT
+    from wrappers import *
+        </code></pre>
+        <h3>Environment Wrappers</h3>
+        <p>We apply several wrappers to preprocess the environment:</p>
+        <ul>
+            <li><strong>JoypadSpace</strong>: Limits actions to a predefined set (COMPLEX_MOVEMENT)</li>
+            <li><strong>wrap_mario</strong>: Custom wrapper that applies:
+                <ul>
+                    <li>Frame skipping</li>
+                    <li>Frame stacking</li>
+                    <li>Reward scaling</li>
+                    <li>Observation preprocessing</li>
+                </ul>
+            </li>
+        </ul>
+        <h3>Observation Preprocessing</h3>
+        <p>The <code>arrange</code> function transforms the observation:</p>
+        <pre><code>
+    def arrange(s):
+        if not type(s) == "numpy.ndarray":
+            s = np.array(s)
+        assert len(s.shape) == 3
+        ret = np.transpose(s, (2, 0, 1))
+        return np.expand_dims(ret, 0)
+        </code></pre>
+        <p>This function:</p>
+        <ol>
+            <li>Converts the observation to a numpy array if needed</li>
+            <li>Transposes dimensions from (H, W, C) to (C, H, W)</li>
+            <li>Adds a batch dimension</li>
+        </ol>
+        <h3>Action Space</h3>
+        <p>COMPLEX_MOVEMENT provides 12 possible actions that combine button presses:</p>
+        <ul>
+            <li>NOOP - No operation</li>
+            <li>RIGHT - Move right</li>
+            <li>RIGHT+A - Right and jump</li>
+            <li>RIGHT+B - Right and run</li>
+            <li>RIGHT+A+B - Right, jump and run</li>
+            <li>A - Jump</li>
+            <li>LEFT - Move left</li>
+            <li>LEFT+A - Left and jump</li>
+            <li>LEFT+B - Left and run</li>
+            <li>LEFT+A+B - Left, jump and run</li>
+            <li>DOWN - Duck</li>
+            <li>UP - Look up</li>
+        </ul>
+        <h3>Installation Requirements</h3>
+        <pre><code>
+    pip install gym-super-mario-bros
+    pip install nes_py
+    pip install opencv-python
+    pip install torch
+        </code></pre>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Note:</strong> The environment returns RGB frames of shape (240, 256, 3).
+        We preprocess these to (4, 84, 84) by stacking 4 grayscale frames and resizing.
+        This significantly reduces the input dimensionality while preserving temporal information.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_memory_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Replay Memory</h1>
+        <h2>Experience Replay</h2>
+        <p>Experience replay is a key component of DQN that helps stabilize training
+        by breaking correlations in sequential observations.</p>
+        <h3>Implementation</h3>
+        <pre><code>
+    class replay_memory(object):
+        def __init__(self, N):
+            self.memory = deque(maxlen=N)
+        def push(self, transition):
+            self.memory.append(transition)
+        def sample(self, n):
+            return random.sample(self.memory, n)
+        def __len__(self):
+            return len(self.memory)
+        </code></pre>
+        <h3>Key Features:</h3>
+        <ul>
+            <li><strong>Fixed Size</strong>: Uses deque with maxlen to maintain a fixed memory size</li>
+            <li><strong>Random Sampling</strong>: Samples random batches to break temporal correlations</li>
+            <li><strong>Transition Storage</strong>: Stores (state, action, reward, next_state, done) tuples</li>
+        </ul>
+        <h3>Why Use Experience Replay?</h3>
+        <ol>
+            <li><strong>Data Efficiency</strong>: Each experience can be used multiple times</li>
+            <li><strong>Reduced Correlation</strong>: Random sampling breaks correlation between consecutive samples</li>
+            <li><strong>Smoother Learning</strong>: Averages behavior distribution over many previous states</li>
+            <li><strong>Stable Training</strong>: Preforms like training on a stationary distribution</li>
+        </ol>
+        <h3>Transition Format</h3>
+        <p>Each transition stored in memory contains:</p>
+        <ul>
+            <li><strong>s</strong>: Current state (preprocessed frame)</li>
+            <li><strong>a</strong>: Action taken</li>
+            <li><strong>r</strong>: Reward received (transformed)</li>
+            <li><strong>s'</strong>: Next state</li>
+            <li><strong>done</strong>: Whether the episode ended (stored as 1-done for terminal states)</li>
+        </ul>
+        <h3>Memory Management</h3>
+        <p>The replay memory has a fixed capacity (N=50000 in our implementation).
+        When the memory is full, older experiences are automatically removed as new ones are added.
+        This ensures the agent always trains on recent experiences while maintaining diversity.</p>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Best Practice:</strong> Wait until the replay memory has sufficient samples
+        (typically 1000-5000) before starting training to ensure diverse batches and
+        avoid overfitting to early experiences.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_nn_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Neural Network Architecture</h1>
+        <h2>Dueling DQN Model</h2>
+        <p>The neural network implements the dueling architecture with separate streams
+        for value and advantage estimation.</p>
+        <h3>Network Structure</h3>
+        <pre><code>
+    class model(nn.Module):
+        def __init__(self, n_frame, n_action, device):
+            super(model, self).__init__()
+            self.layer1 = nn.Conv2d(n_frame, 32, 8, 4)
+            self.layer2 = nn.Conv2d(32, 64, 3, 1)
+            self.fc = nn.Linear(20736, 512)
+            self.q = nn.Linear(512, n_action)  # Advantage stream
+            self.v = nn.Linear(512, 1)         # Value stream
+            self.device = device
+        def forward(self, x):
+            if type(x) != torch.Tensor:
+                x = torch.FloatTensor(x).to(self.device)
+            x = torch.relu(self.layer1(x))
+            x = torch.relu(self.layer2(x))
+            x = x.view(-1, 20736)
+            x = torch.relu(self.fc(x))
+            adv = self.q(x)  # Advantage stream
+            v = self.v(x)    # Value stream
+            # Combine using dueling formula
+            q = v + (adv - 1 / adv.shape[-1] * adv.sum(-1, keepdim=True))
+            return q
+        </code></pre>
+        <h3>Layer Details</h3>
+        <ul>
+            <li><strong>Conv2d(4, 32, 8, 4)</strong>: 32 filters, kernel size 8, stride 4</li>
+            <li><strong>Conv2d(32, 64, 3, 1)</strong>: 64 filters, kernel size 3, stride 1</li>
+            <li><strong>Linear(20736, 512)</strong>: Fully connected layer with 512 units</li>
+            <li><strong>Value Head</strong>: Single output estimating V(s)</li>
+            <li><strong>Advantage Head</strong>: n_action outputs estimating A(s, a)</li>
+        </ul>
+        <h3>Dueling Combination</h3>
+        <p>The key innovation is how we combine value and advantage:</p>
+        <pre><code>
+    q = v + (adv - 1 / adv.shape[-1] * adv.sum(-1, keepdim=True))
+        </code></pre>
+        <p>This ensures that:</p>
+        <ul>
+            <li>The advantage function has zero mean for each state</li>
+            <li>We can recover both V(s) and A(s, a) from Q(s, a)</li>
+            <li>The network learns which states are valuable without having to learn
+            the effect of each action in every state</li>
+            <li>The value stream focuses on state quality, advantage stream on action quality</li>
+        </ul>
+        <h3>Weight Initialization</h3>
+        <pre><code>
+    def init_weights(m):
+        if type(m) == nn.Conv2d:
+            torch.nn.init.xavier_uniform_(m.weight)
+            m.bias.data.fill_(0.01)
+        </code></pre>
+        <p>We use Xavier initialization for convolutional layers to maintain stable gradients
+        throughout the network. This helps with convergence during training.</p>
+        <h3>Feature Extraction</h3>
+        <p>The convolutional layers extract spatial features from the input frames:</p>
+        <ul>
+            <li><strong>First conv layer</strong>: Detects basic features like edges and colors</li>
+            <li><strong>Second conv layer</strong>: Combines basic features into more complex patterns</li>
+            <li><strong>Flattening</strong>: Converts spatial features to a 1D vector for the fully connected layers</li>
+        </ul>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Architecture Choice:</strong> The 20736 value comes from the flattened output
+        of the second convolutional layer. This is calculated based on the input dimensions
+        and the network architecture. For a (4, 84, 84) input, the convolutions produce
+        feature maps that when flattened give 20736 elements.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_training_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Training Algorithm</h1>
+        <h2>DQN Training Loop</h2>
+        <p>The training process follows the standard DQN algorithm with the addition
+        of the dueling architecture.</p>
+        <h3>Key Components</h3>
+        <ul>
+            <li><strong>Target Network</strong>: Separate network for stable Q-targets</li>
+            <li><strong>Experience Replay</strong>: Stores and samples past experiences</li>
+            <li><strong>Epsilon-Greedy Exploration</strong>: Balances exploration and exploitation</li>
+            <li><strong>Periodic Updates</strong>: Syncs target network with main network</li>
+            <li><strong>Gradient Clipping</strong>: Prevents exploding gradients (implicit in smooth_l1_loss)</li>
+        </ul>
+        <h3>Training Function</h3>
+        <pre><code>
+    def train(q, q_target, memory, batch_size, gamma, optimizer, device):
+        s, r, a, s_prime, done = list(map(list, zip(*memory.sample(batch_size))))
+        s = np.array(s).squeeze()
+        s_prime = np.array(s_prime).squeeze()
+        a_max = q(s_prime).max(1)[1].unsqueeze(-1)
+        r = torch.FloatTensor(r).unsqueeze(-1).to(device)
+        done = torch.FloatTensor(done).unsqueeze(-1).to(device)
+        with torch.no_grad():
+            y = r + gamma * q_target(s_prime).gather(1, a_max) * done
+        a = torch.tensor(a).unsqueeze(-1).to(device)
+        q_value = torch.gather(q(s), dim=1, index=a.view(-1, 1).long())
+        loss = F.smooth_l1_loss(q_value, y).mean()
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        return loss
+        </code></pre>
+        <h3>Loss Calculation</h3>
+        <p>We use Smooth L1 Loss (Huber loss) which is more robust to outliers than MSE:</p>
+        <pre><code>
+    loss = F.smooth_l1_loss(q_value, y).mean()
+        </code></pre>
+        <p>Smooth L1 Loss behaves like L1 loss when the error is large (reducing sensitivity to outliers)
+        and like L2 loss when the error is small (providing smooth gradients).</p>
+        <h3>Target Calculation</h3>
+        <p>The target Q-values are calculated using the target network:</p>
+        <pre><code>
+    y = r + gamma * q_target(s_prime).gather(1, a_max) * done
+        </code></pre>
+        <p>Where:</p>
+        <ul>
+            <li><strong>r</strong>: Immediate reward</li>
+            <li><strong>gamma</strong>: Discount factor (0.99)</li>
+            <li><strong>q_target(s_prime)</strong>: Q-values from target network</li>
+            <li><strong>a_max</strong>: Best action according to online network (Double DQN)</li>
+            <li><strong>done</strong>: Terminal state indicator (0 for terminal states)</li>
+        </ul>
+        <h3>Double DQN</h3>
+        <p>Note that we use the Double DQN approach: the online network selects the action,
+        but the target network evaluates it. This reduces overestimation bias.</p>
+        <h3>Main Training Loop</h3>
+        <p>The main function handles:</p>
+        <ol>
+            <li>Environment interaction and experience collection</li>
+            <li>Epsilon-greedy action selection</li>
+            <li>Reward transformation and experience storage</li>
+            <li>Network training with experience replay</li>
+            <li>Target network updates at fixed intervals</li>
+            <li>Model checkpointing and progress logging</li>
+        </ol>
+        <h3>Hyperparameters</h3>
+        <ul>
+            <li><strong>Gamma</strong>: 0.99 (discount factor for future rewards)</li>
+            <li><strong>Batch Size</strong>: 256 (experiences per training step)</li>
+            <li><strong>Memory Size</strong>: 50,000 (maximum replay buffer size)</li>
+            <li><strong>Learning Rate</strong>: 0.0001 (Adam optimizer)</li>
+            <li><strong>Update Interval</strong>: 50 steps (target network sync frequency)</li>
+            <li><strong>Epsilon</strong>: 0.001 (fixed exploration rate during training)</li>
+            <li><strong>Print Interval</strong>: 10 episodes (progress reporting frequency)</li>
+        </ul>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Training Strategy:</strong> The agent starts training only after collecting
+        2000 experiences to ensure diverse training data. The fixed epsilon of 0.001 means
+        the agent explores 0.1% of the time, exploiting its learned policy 99.9% of the time.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_implementation_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        content = """
+        <h1>Complete Implementation</h1>
+        <h2>Putting It All Together</h2>
+        <p>Now let's look at the complete implementation that brings all components together.</p>
+        <h3>Main Function</h3>
+        <pre><code>
+    def main(env, q, q_target, optimizer, device):
+        t = 0
+        gamma = 0.99
+        batch_size = 256
+        N = 50000
+        eps = 0.001
+        memory = replay_memory(N)
+        update_interval = 50
+        print_interval = 10
+        score_lst = []
+        total_score = 0.0
+        loss = 0.0
+        start_time = time.perf_counter()
+        for k in range(1000000):
+            s = arrange(env.reset())
+            done = False
+            while not done:
+                if eps > np.random.rand():
+                    a = env.action_space.sample()
+                else:
+                    if device == "cpu":
+                        a = np.argmax(q(s).detach().numpy())
+                    else:
+                        a = np.argmax(q(s).cpu().detach().numpy())
+                s_prime, r, done, _ = env.step(a)
+                s_prime = arrange(s_prime)
+                total_score += r
+                r = np.sign(r) * (np.sqrt(abs(r) + 1) - 1) + 0.001 * r
+                memory.push((s, float(r), int(a), s_prime, int(1 - done)))
+                s = s_prime
+                stage = env.unwrapped._stage
+                if len(memory) > 2000:
+                    loss += train(q, q_target, memory, batch_size, gamma, optimizer, device)
+                    t += 1
+                if t % update_interval == 0:
+                    copy_weights(q, q_target)
+                    torch.save(q.state_dict(), "mario_q.pth")
+                    torch.save(q_target.state_dict(), "mario_q_target.pth")
+            if k % print_interval == 0:
+                time_spent, start_time = (
+                    time.perf_counter() - start_time,
+                    time.perf_counter(),
+                )
+                print(
+                    "%s |Epoch : %d | score : %f | loss : %.2f | stage : %d | time spent: %f"
+                    % (
+                        device,
+                        k,
+                        total_score / print_interval,
+                        loss / print_interval,
+                        stage,
+                        time_spent,
+                    )
+                )
+                score_lst.append(total_score / print_interval)
+                total_score = 0
+                loss = 0.0
+                pickle.dump(score_lst, open("score.p", "wb"))
+        </code></pre>
+        <h3>Reward Shaping</h3>
+        <p>We apply a transformation to the rewards to improve learning:</p>
+        <pre><code>
+    r = np.sign(r) * (np.sqrt(abs(r) + 1) - 1) + 0.001 * r
+        </code></pre>
+        <p>This transformation:</p>
+        <ul>
+            <li>Compresses large rewards while preserving their sign using square root</li>
+            <li>Adds a small linear component (0.001 * r) to maintain reward differences</li>
+            <li>Helps with reward scaling issues in environments with varying reward magnitudes</li>
+            <li>Makes the learning process more stable by bounding extreme rewards</li>
+        </ul>
+        <h3>Device Handling</h3>
+        <p>The code automatically detects and uses available hardware:</p>
+        <pre><code>
+    device = "cpu"
+    if torch.cuda.is_available():
+        device = "cuda"
+    elif torch.backends.mps.is_available():
+        device = "mps"
+        </code></pre>
+        <p>This ensures optimal performance across different systems while maintaining compatibility.</p>
+        <h3>Model Initialization</h3>
+        <pre><code>
+    q = model(n_frame, env.action_space.n, device).to(device)
+    q_target = model(n_frame, env.action_space.n, device).to(device)
+    optimizer = optim.Adam(q.parameters(), lr=0.0001)
+        </code></pre>
+        <h3>Running the Code</h3>
+        <p>To run the complete implementation:</p>
+        <ol>
+            <li>Install dependencies: <code>pip install gym-super-mario-bros nes_py torch</code></li>
+            <li>Create the <code>wrappers.py</code> file with environment wrappers</li>
+            <li>Run the script: <code>python duel_dqn.py</code></li>
+        </ol>
+        <h3>Expected Output</h3>
+        <p>During training, you should see output like:</p>
+        <pre><code>
+    cuda |Epoch : 0 | score : 125.400000 | loss : 0.00 | stage : 1 | time spent: 12.345678
+    cuda |Epoch : 10 | score : 256.800000 | loss : 1.23 | stage : 1 | time spent: 15.678901
+    cuda |Epoch : 20 | score : 512.100000 | loss : 0.87 | stage : 2 | time spent: 18.901234
+        </code></pre>
+        <h3>Model Checkpoints</h3>
+        <p>The training process automatically saves model checkpoints:</p>
+        <ul>
+            <li><code>mario_q.pth</code>: Main Q-network weights</li>
+            <li><code>mario_q_target.pth</code>: Target network weights</li>
+            <li><code>score.p</code>: Training scores for analysis</li>
+        </ul>
+        <div style="background-color: #f8f9fa; padding: 15px; border-radius: 5px; margin-top: 20px;">
+        <strong>Training Time:</strong> Training will take many hours or days to produce a competent agent.
+        You can monitor progress through the printed statistics and saved model checkpoints.
+        For faster results, consider using a pre-trained model or reducing the environment complexity.
+        </div>
+        """
+        text_edit = QTextEdit()
+        text_edit.setHtml(content)
+        text_edit.setReadOnly(True)
+        layout.addWidget(text_edit)
+        return widget
+    def create_exercises_tab(self):
+        widget = QWidget()
+        layout = QVBoxLayout(widget)
+        # Create a tab widget for different exercises
+        exercise_tabs = QTabWidget()
+        # Exercise 1: Replay Memory
+        exercise1 = CodeExerciseWidget(
+            "Exercise 1: Implement Replay Memory",
+            "Create a replay memory class that stores transitions and can sample random batches.",
+            """import random
+from collections import deque
+class ReplayMemory:
+    def __init__(self, capacity):
+        # TODO: Initialize memory with maximum capacity
+        pass
+    def push(self, transition):
+        # TODO: Add a transition to memory
+        pass
+    def sample(self, batch_size):
+        # TODO: Return a random sample of batch_size transitions
+        pass
+    def __len__(self):
+        # TODO: Return current size of memory
+        pass
+""",
+            """import random
+from collections import deque
+class ReplayMemory:
+    def __init__(self, capacity):
+        self.memory = deque(maxlen=capacity)
+    def push(self, transition):
+        self.memory.append(transition)
+    def sample(self, batch_size):
+        return random.sample(self.memory, batch_size)
+    def __len__(self):
+        return len(self.memory)
+""",
+            "Replay Memory"
+        )
+        exercise1.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise1, "Replay Memory")
+        # Exercise 2: Dueling DQN Model
+        exercise2 = CodeExerciseWidget(
+            "Exercise 2: Implement Dueling DQN Model",
+            "Create the neural network with separate value and advantage streams.",
+            """import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class DuelingDQN(nn.Module):
+    def __init__(self, input_shape, num_actions):
+        super(DuelingDQN, self).__init__()
+        # TODO: Define convolutional layers
+        # TODO: Define value stream
+        # TODO: Define advantage stream
+    def forward(self, x):
+        # TODO: Implement forward pass with dueling architecture
+        pass
+""",
+            """import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class DuelingDQN(nn.Module):
+    def __init__(self, input_shape, num_actions):
+        super(DuelingDQN, self).__init__()
+        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
+        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
+        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
+        # Calculate size after convolutions
+        conv_size = self._get_conv_size(input_shape)
+        self.fc_adv = nn.Linear(conv_size, 512)
+        self.fc_val = nn.Linear(conv_size, 512)
+        self.advantage = nn.Linear(512, num_actions)
+        self.value = nn.Linear(512, 1)
+    def _get_conv_size(self, shape):
+        x = torch.zeros(1, *shape)
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
+        return x.view(1, -1).size(1)
+    def forward(self, x):
+        x = F.relu(self.conv1(x))
+        x = F.relu(self.conv2(x))
+        x = F.relu(self.conv3(x))
+        x = x.view(x.size(0), -1)
+        adv = F.relu(self.fc_adv(x))
+        val = F.relu(self.fc_val(x))
+        adv = self.advantage(adv)
+        val = self.value(val).expand(x.size(0), adv.size(1))
+        # Combine using dueling formula
+        q = val + adv - adv.mean(1, keepdim=True).expand(x.size(0), adv.size(1))
+        return q
+""",
+            "Dueling DQN"
+        )
+        exercise2.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise2, "Dueling DQN")
+        # Exercise 3: Environment Wrapper
+        exercise3 = CodeExerciseWidget(
+            "Exercise 3: Environment Wrapper",
+            "Create a wrapper to preprocess the Super Mario Bros environment.",
+            """import gym
+import numpy as np
+from collections import deque
+class MarioEnvironmentWrapper:
+    def __init__(self, env, stack_frames=4):
+        self.env = env
+        self.stack_frames = stack_frames
+        # TODO: Initialize frame stack
+    def reset(self):
+        # TODO: Reset environment and initialize frame stack
+        pass
+    def step(self, action):
+        # TODO: Take step and update frame stack
+        pass
+    def _preprocess_frame(self, frame):
+        # TODO: Preprocess frame (grayscale, resize, normalize)
+        pass
+""",
+            """import gym
+import numpy as np
+from collections import deque
+import cv2
+class MarioEnvironmentWrapper:
+    def __init__(self, env, stack_frames=4):
+        self.env = env
+        self.stack_frames = stack_frames
+        self.frames = deque(maxlen=stack_frames)
+    def reset(self):
+        frame = self.env.reset()
+        frame = self._preprocess_frame(frame)
+        for _ in range(self.stack_frames):
+            self.frames.append(frame)
+        return np.array(self.frames)
+    def step(self, action):
+        next_frame, reward, done, info = self.env.step(action)
+        next_frame = self._preprocess_frame(next_frame)
+        self.frames.append(next_frame)
+        return np.array(self.frames), reward, done, info
+    def _preprocess_frame(self, frame):
+        # Convert to grayscale
+        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        # Resize
+        frame = cv2.resize(frame, (84, 84))
+        # Normalize
+        frame = frame / 255.0
+        return frame
+""",
+            "Environment Wrapper"
+        )
+        exercise3.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise3, "Environment Wrapper")
+        # Exercise 4: Training Loop
+        exercise4 = CodeExerciseWidget(
+            "Exercise 4: Training Loop Implementation",
+            "Implement the main training loop with epsilon-greedy exploration and target network updates.",
+            """def train_dqn(env, model, target_model, optimizer, device, episodes=1000):
+    memory = ReplayMemory(10000)
+    batch_size = 32
+    gamma = 0.99
+    epsilon = 1.0
+    epsilon_min = 0.01
+    epsilon_decay = 0.995
+    target_update = 10
+    scores = []
+    for episode in range(episodes):
+        state = env.reset()
+        total_reward = 0
+        done = False
+        while not done:
+            # TODO: Implement epsilon-greedy action selection
+            # TODO: Take action and store experience
+            # TODO: Train model if enough experiences
+            # TODO: Update target network periodically
+            pass
+        # TODO: Decay epsilon
+        # TODO: Log progress
+    return scores
+""",
+            """def train_dqn(env, model, target_model, optimizer, device, episodes=1000):
+    memory = ReplayMemory(10000)
+    batch_size = 32
+    gamma = 0.99
+    epsilon = 1.0
+    epsilon_min = 0.01
+    epsilon_decay = 0.995
+    target_update = 10
+    scores = []
+    for episode in range(episodes):
+        state = env.reset()
+        total_reward = 0
+        done = False
+        steps = 0
+        while not done:
+            # Epsilon-greedy action selection
+            if np.random.random() <= epsilon:
+                action = env.action_space.sample()
+            else:
+                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
+                q_values = model(state_tensor)
+                action = q_values.argmax().item()
+            # Take action
+            next_state, reward, done, info = env.step(action)
+            total_reward += reward
+            # Store experience
+            memory.push((state, action, reward, next_state, done))
+            state = next_state
+            # Train model if enough experiences
+            if len(memory) > batch_size:
+                batch = memory.sample(batch_size)
+                states, actions, rewards, next_states, dones = zip(*batch)
+                # Convert to tensors
+                states = torch.FloatTensor(np.array(states)).to(device)
+                actions = torch.LongTensor(actions).to(device)
+                rewards = torch.FloatTensor(rewards).to(device)
+                next_states = torch.FloatTensor(np.array(next_states)).to(device)
+                dones = torch.BoolTensor(dones).to(device)
+                # Compute Q-values
+                current_q = model(states).gather(1, actions.unsqueeze(1))
+                # Compute target Q-values
+                with torch.no_grad():
+                    next_actions = model(next_states).argmax(1)
+                    next_q = target_model(next_states).gather(1, next_actions.unsqueeze(1))
+                    target_q = rewards.unsqueeze(1) + gamma * next_q * (~dones).unsqueeze(1)
+                # Compute loss and update
+                loss = F.smooth_l1_loss(current_q, target_q)
+                optimizer.zero_grad()
+                loss.backward()
+                optimizer.step()
+            steps += 1
+            # Update target network
+            if steps % target_update == 0:
+                target_model.load_state_dict(model.state_dict())
+        # Decay epsilon
+        epsilon = max(epsilon_min, epsilon * epsilon_decay)
+        scores.append(total_reward)
+        if episode % 100 == 0:
+            print(f"Episode {episode}, Score: {total_reward}, Epsilon: {epsilon:.3f}")
+    return scores
+""",
+            "Training Loop"
+        )
+        exercise4.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise4, "Training Loop")
+        # Exercise 5: Reward Shaping
+        exercise5 = CodeExerciseWidget(
+            "Exercise 5: Reward Shaping Function",
+            "Implement a reward transformation function to improve learning stability.",
+            """def transform_reward(reward, info, prev_info=None):
+    \"\"\"
+    Transform the raw reward to improve learning.
+    Args:
+        reward: Raw reward from environment
+        info: Current step info dictionary
+        prev_info: Previous step info (for computing deltas)
+    Returns:
+        Transformed reward value
+    \"\"\"
+    transformed_reward = reward
+    # TODO: Implement reward transformations:
+    # 1. Scale large rewards
+    # 2. Add small positive rewards for progress
+    # 3. Penalize time-wasting
+    # 4. Reward level completion
+    return transformed_reward
+""",
+            """def transform_reward(reward, info, prev_info=None):
+    \"\"\"
+    Transform the raw reward to improve learning.
+    Args:
+        reward: Raw reward from environment
+        info: Current step info dictionary
+        prev_info: Previous step info (for computing deltas)
+    Returns:
+        Transformed reward value
+    \"\"\"
+    transformed_reward = reward
+    # 1. Scale large rewards using a signed square-root transform
+    if abs(reward) > 1:
+        transformed_reward = np.sign(reward) * (np.sqrt(abs(reward) + 1) - 1) + 0.001 * reward
+    # 2. Add small positive reward for x-position progress
+    if prev_info is not None:
+        x_pos = info.get('x_pos', 0)
+        prev_x_pos = prev_info.get('x_pos', 0)
+        x_progress = x_pos - prev_x_pos
+        # Reward moving right, penalize moving left
+        if x_progress > 0:
+            transformed_reward += 0.1
+        elif x_progress < -1:  # Small left movements might be okay
+            transformed_reward -= 0.2
+    # 3. Penalize time-wasting (standing still too long)
+    time_penalty = -0.01
+    transformed_reward += time_penalty
+    # 4. Large reward for completing level
+    if info.get('flag_get', False):
+        transformed_reward += 100
+    # 5. Reward coin collection
+    if prev_info is not None:
+        coins = info.get('coins', 0)
+        prev_coins = prev_info.get('coins', 0)
+        if coins > prev_coins:
+            transformed_reward += 1.0
+    # 6. Reward defeating enemies
+    if prev_info is not None:
+        score = info.get('score', 0)
+        prev_score = prev_info.get('score', 0)
+        if score > prev_score and reward == 0:
+            # Enemy defeated (score increased but no immediate reward)
+            transformed_reward += 0.5
+    # Clip reward to reasonable range
+    transformed_reward = np.clip(transformed_reward, -5, 10)
+    return transformed_reward
+# Alternative: The reward transformation from the original code
+def original_reward_transform(reward):
+    \"\"\"The reward transformation used in the original duel_dqn.py\"\"\"
+    return np.sign(reward) * (np.sqrt(abs(reward) + 1) - 1) + 0.001 * reward
+""",
+            "Reward Shaping"
+        )
+        exercise5.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise5, "Reward Shaping")
+        # Exercise 6: Model Saving/Loading
+        exercise6 = CodeExerciseWidget(
+            "Exercise 6: Model Saving and Loading",
+            "Implement functions to save and load trained models with proper error handling.",
+            """import torch
+import os
+class ModelManager:
+    def __init__(self, model_dir='models'):
+        self.model_dir = model_dir
+        os.makedirs(model_dir, exist_ok=True)
+    def save_checkpoint(self, model, target_model, optimizer, episode, loss, filename):
+        \"\"\"
+        Save training checkpoint including model weights and training state.
+        \"\"\"
+        # TODO: Implement checkpoint saving
+        pass
+    def load_checkpoint(self, filename, model, target_model=None, optimizer=None):
+        \"\"\"
+        Load training checkpoint and restore state.
+        \"\"\"
+        # TODO: Implement checkpoint loading
+        pass
+    def save_best_model(self, model, score, filename):
+        \"\"\"
+        Save the best model based on validation score.
+        \"\"\"
+        # TODO: Implement best model saving
+        pass
+""",
+            """import torch
+import os
+import json
+class ModelManager:
+    def __init__(self, model_dir='models'):
+        self.model_dir = model_dir
+        os.makedirs(model_dir, exist_ok=True)
+    def save_checkpoint(self, model, target_model, optimizer, episode, loss, scores, filename):
+        \"\"\"
+        Save training checkpoint including model weights and training state.
+        \"\"\"
+        checkpoint = {
+            'episode': episode,
+            'model_state_dict': model.state_dict(),
+            'target_model_state_dict': target_model.state_dict(),
+            'optimizer_state_dict': optimizer.state_dict(),
+            'loss': loss,
+            'scores': scores,
+            'model_architecture': str(model)
+        }
+        filepath = os.path.join(self.model_dir, filename)
+        torch.save(checkpoint, filepath)
+        # Also save metadata as JSON
+        metadata = {
+            'episode': episode,
+            'loss': loss,
+            'final_score': scores[-1] if scores else 0,
+            'timestamp': time.time()
+        }
+        metadata_path = os.path.join(self.model_dir, f"{filename}_metadata.json")
+        with open(metadata_path, 'w') as f:
+            json.dump(metadata, f, indent=2)
+        print(f"Checkpoint saved: {filepath}")
+    def load_checkpoint(self, filename, model, target_model=None, optimizer=None):
+        \"\"\"
+        Load training checkpoint and restore state.
+        Returns:
+            dict: Checkpoint data including episode and scores
+        \"\"\"
+        filepath = os.path.join(self.model_dir, filename)
+        if not os.path.exists(filepath):
+            raise FileNotFoundError(f"Checkpoint file not found: {filepath}")
+        checkpoint = torch.load(filepath, map_location='cpu')
+        # Load model weights
+        model.load_state_dict(checkpoint['model_state_dict'])
+        # Load target model weights if provided
+        if target_model is not None and 'target_model_state_dict' in checkpoint:
+            target_model.load_state_dict(checkpoint['target_model_state_dict'])
+        # Load optimizer state if provided
+        if optimizer is not None and 'optimizer_state_dict' in checkpoint:
+            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        print(f"Checkpoint loaded: {filepath}")
+        print(f"Resuming from episode {checkpoint['episode']}")
+        return {
+            'episode': checkpoint['episode'],
+            'loss': checkpoint.get('loss', 0),
+            'scores': checkpoint.get('scores', [])
+        }
+    def save_best_model(self, model, score, filename='best_model.pth'):
+        \"\"\"
+        Save the best model based on validation score.
+        \"\"\"
+        filepath = os.path.join(self.model_dir, filename)
+        # Save model state
+        torch.save(model.state_dict(), filepath)
+        # Save score information
+        score_info = {
+            'score': score,
+            'timestamp': time.time()
+        }
+        score_path = os.path.join(self.model_dir, f"{filename}_score.json")
+        with open(score_path, 'w') as f:
+            json.dump(score_info, f, indent=2)
+        print(f"Best model saved with score {score}: {filepath}")
+    def list_checkpoints(self):
+        \"\"\"List all available checkpoints.\"\"\"
+        checkpoints = []
+        for file in os.listdir(self.model_dir):
+            if file.endswith('.pth') and not file.endswith('_metadata.json'):
+                checkpoints.append(file)
+        return sorted(checkpoints)
+# Usage example:
+def setup_model_persistence():
+    \"\"\"Example of how to use the ModelManager\"\"\"
+    manager = ModelManager()
+    # Example saving
+    # manager.save_checkpoint(q, q_target, optimizer, episode, loss, scores, 'checkpoint_1000.pth')
+    # Example loading
+    # checkpoint_data = manager.load_checkpoint('checkpoint_1000.pth', q, q_target, optimizer)
+    # start_episode = checkpoint_data['episode'] + 1
+    return manager
+""",
+            "Model Saving"
+        )
+        exercise6.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise6, "Model Saving")
+        # Exercise 7: Hyperparameter Tuning
+        exercise7 = CodeExerciseWidget(
+            "Exercise 7: Hyperparameter Configuration System",
+            "Create a flexible configuration system for managing hyperparameters.",
+            """class DQNConfig:
+    \"\"\"Configuration class for DQN hyperparameters.\"\"\"
+    def __init__(self):
+        # TODO: Define all hyperparameters with default values
+        pass
+    def from_dict(self, config_dict):
+        \"\"\"Update configuration from dictionary.\"\"\"
+        # TODO: Implement dictionary-based configuration
+        pass
+    def to_dict(self):
+        \"\"\"Convert configuration to dictionary.\"\"\"
+        # TODO: Implement conversion to dictionary
+        pass
+    def validate(self):
+        \"\"\"Validate configuration parameters.\"\"\"
+        # TODO: Implement validation logic
+        pass
+""",
+            """import json
+import yaml
+from dataclasses import dataclass, asdict
+from typing import List, Optional
+@dataclass
+class DQNConfig:
+    \"\"\"Configuration class for DQN hyperparameters.\"\"\"
+    # Environment settings
+    env_name: str = "SuperMarioBros-v0"
+    action_space: str = "COMPLEX_MOVEMENT"
+    frame_stack: int = 4
+    frame_skip: int = 4
+    # Training hyperparameters
+    learning_rate: float = 0.0001
+    gamma: float = 0.99
+    batch_size: int = 32
+    buffer_size: int = 10000
+    episodes: int = 10000
+    # Exploration settings
+    epsilon_start: float = 1.0
+    epsilon_end: float = 0.01
+    epsilon_decay: float = 0.995
+    epsilon_decay_steps: int = 10000
+    # Network architecture
+    conv_filters: List[int] = None
+    conv_kernel_sizes: List[int] = None
+    conv_strides: List[int] = None
+    hidden_sizes: List[int] = None
+    # Training schedule
+    learning_starts: int = 1000
+    target_update_frequency: int = 1000
+    train_frequency: int = 4
+    # Reward shaping
+    reward_scale: float = 1.0
+    reward_clip: float = 10.0
+    use_reward_shaping: bool = True
+    # Device and logging
+    device: str = "auto"
+    save_interval: int = 100
+    log_interval: int = 10
+    eval_interval: int = 100
+    def __post_init__(self):
+        \"\"\"Set default values for lists.\"\"\"
+        if self.conv_filters is None:
+            self.conv_filters = [32, 64, 64]
+        if self.conv_kernel_sizes is None:
+            self.conv_kernel_sizes = [8, 4, 3]
+        if self.conv_strides is None:
+            self.conv_strides = [4, 2, 1]
+        if self.hidden_sizes is None:
+            self.hidden_sizes = [512]
+    @classmethod
+    def from_dict(cls, config_dict):
+        \"\"\"Create configuration from dictionary.\"\"\"
+        return cls(**config_dict)
+    def to_dict(self):
+        \"\"\"Convert configuration to dictionary.\"\"\"
+        return asdict(self)
+    def save(self, filename):
+        \"\"\"Save configuration to JSON file.\"\"\"
+        with open(filename, 'w') as f:
+            json.dump(self.to_dict(), f, indent=2)
+    @classmethod
+    def load(cls, filename):
+        \"\"\"Load configuration from JSON file.\"\"\"
+        with open(filename, 'r') as f:
+            config_dict = json.load(f)
+        return cls.from_dict(config_dict)
+    def validate(self):
+        \"\"\"Validate configuration parameters.\"\"\"
+        assert self.learning_rate > 0, "Learning rate must be positive"
+        assert 0 <= self.gamma <= 1, "Gamma must be between 0 and 1"
+        assert self.batch_size > 0, "Batch size must be positive"
+        assert self.buffer_size >= self.batch_size, "Buffer size must be >= batch size"
+        assert 0 <= self.epsilon_end <= self.epsilon_start <= 1, "Invalid epsilon values"
+        assert len(self.conv_filters) == len(self.conv_kernel_sizes) == len(self.conv_strides), \
+               "Conv configuration lists must have same length"
+# Example configurations for different scenarios
+def get_fast_config():
+    \"\"\"Configuration for fast training (lower quality).\"\"\"
+    return DQNConfig(
+        batch_size=16,
+        buffer_size=5000,
+        episodes=2000,
+        epsilon_decay=0.99,
+        target_update_frequency=500,
+        save_interval=50
+    )
+def get_high_quality_config():
+    \"\"\"Configuration for high-quality training (slower).\"\"\"
+    return DQNConfig(
+        batch_size=64,
+        buffer_size=50000,
+        episodes=50000,
+        epsilon_decay=0.999,
+        target_update_frequency=1000,
+        conv_filters=[64, 128, 128],
+        hidden_sizes=[512, 256]
+    )
+def get_debug_config():
+    \"\"\"Configuration for debugging.\"\"\"
+    return DQNConfig(
+        episodes=100,
+        batch_size=8,
+        buffer_size=1000,
+        save_interval=10,
+        log_interval=1
+    )
+# Usage example:
+def setup_training_with_config():
+    config = DQNConfig()
+    config.validate()
+    # Save config
+    config.save('training_config.json')
+    # Load config
+    # loaded_config = DQNConfig.load('training_config.json')
+    return config
+""",
+            "Hyperparameter Tuning"
+        )
+        exercise7.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise7, "Hyperparameter Tuning")
+        # Exercise 8: Evaluation Metrics
+        exercise8 = CodeExerciseWidget(
+            "Exercise 8: Evaluation and Metrics System",
+            "Implement comprehensive evaluation metrics and visualization tools.",
+            """import matplotlib.pyplot as plt
+import numpy as np
+from collections import deque
+class TrainingMetrics:
+    def __init__(self, window_size=100):
+        # TODO: Initialize metric trackers
+        pass
+    def update(self, episode, score, loss, epsilon, steps):
+        # TODO: Update all metrics
+        pass
+    def get_recent_scores(self):
+        # TODO: Return recent scores for plotting
+        pass
+    def plot_training_progress(self):
+        # TODO: Create training progress visualization
+        pass
+    def generate_report(self):
+        # TODO: Generate training summary report
+        pass
+def evaluate_agent(env, model, episodes=10, render=False):
+    \"\"\"
+    Evaluate the trained agent on multiple episodes.
+    \"\"\"
+    # TODO: Implement agent evaluation
+    pass
+""",
+            """import matplotlib.pyplot as plt
+import numpy as np
+from collections import deque
+import pandas as pd
+import seaborn as sns
+from typing import Dict, List, Tuple
+class TrainingMetrics:
+    def __init__(self, window_size=100):
+        self.window_size = window_size
+        # Metrics storage
+        self.episode_scores = []
+        self.episode_losses = []
+        self.episode_steps = []
+        self.episode_epsilons = []
+        self.episode_times = []
+        # Rolling averages
+        self.recent_scores = deque(maxlen=window_size)
+        self.recent_losses = deque(maxlen=window_size)
+        # Best performance tracking
+        self.best_score = -np.inf
+        self.best_episode = 0
+    def update(self, episode, score, loss, epsilon, steps, time_taken):
+        \"\"\"Update all metrics with new episode data.\"\"\"
+        self.episode_scores.append(score)
+        self.episode_losses.append(loss)
+        self.episode_epsilons.append(epsilon)
+        self.episode_steps.append(steps)
+        self.episode_times.append(time_taken)
+        self.recent_scores.append(score)
+        self.recent_losses.append(loss)
+        # Update best score
+        if score > self.best_score:
+            self.best_score = score
+            self.best_episode = episode
+    def get_recent_scores(self) -> List[float]:
+        \"\"\"Return recent scores for plotting.\"\"\"
+        return list(self.recent_scores)
+    def get_moving_averages(self) -> Dict[str, float]:
+        \"\"\"Calculate moving averages of key metrics.\"\"\"
+        if len(self.recent_scores) == 0:
+            return {}
+        return {
+            'score_ma': np.mean(self.recent_scores),
+            'loss_ma': np.mean(self.recent_losses),
+            'score_std': np.std(self.recent_scores),
+            'loss_std': np.std(self.recent_losses)
+        }
+    def plot_training_progress(self, save_path=None):
+        \"\"\"Create comprehensive training progress visualization.\"\"\"
+        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+        episodes = range(len(self.episode_scores))
+        # Plot 1: Scores
+        ax1.plot(episodes, self.episode_scores, 'b-', alpha=0.3, label='Raw Scores')
+        if len(episodes) >= self.window_size:
+            moving_avg = pd.Series(self.episode_scores).rolling(self.window_size).mean()
+            ax1.plot(episodes, moving_avg, 'r-', linewidth=2, label=f'Moving Avg ({self.window_size})')
+        ax1.axhline(y=self.best_score, color='g', linestyle='--', label=f'Best: {self.best_score:.1f}')
+        ax1.set_xlabel('Episode')
+        ax1.set_ylabel('Score')
+        ax1.set_title('Training Scores')
+        ax1.legend()
+        ax1.grid(True, alpha=0.3)
+        # Plot 2: Loss
+        ax2.plot(episodes, self.episode_losses, 'r-', alpha=0.3)
+        if len(episodes) >= self.window_size:
+            moving_avg_loss = pd.Series(self.episode_losses).rolling(self.window_size).mean()
+            ax2.plot(episodes, moving_avg_loss, 'black', linewidth=2)
+        ax2.set_xlabel('Episode')
+        ax2.set_ylabel('Loss')
+        ax2.set_title('Training Loss')
+        ax2.grid(True, alpha=0.3)
+        # Plot 3: Epsilon
+        ax3.plot(episodes, self.episode_epsilons, 'g-')
+        ax3.set_xlabel('Episode')
+        ax3.set_ylabel('Epsilon')
+        ax3.set_title('Exploration Rate')
+        ax3.grid(True, alpha=0.3)
+        # Plot 4: Steps per episode
+        ax4.plot(episodes, self.episode_steps, 'purple', alpha=0.3)
+        if len(episodes) >= self.window_size:
+            moving_avg_steps = pd.Series(self.episode_steps).rolling(self.window_size).mean()
+            ax4.plot(episodes, moving_avg_steps, 'black', linewidth=2)
+        ax4.set_xlabel('Episode')
+        ax4.set_ylabel('Steps')
+        ax4.set_title('Steps per Episode')
+        ax4.grid(True, alpha=0.3)
+        plt.tight_layout()
+        if save_path:
+            plt.savefig(save_path, dpi=300, bbox_inches='tight')
+            print(f"Training plot saved to {save_path}")
+        plt.show()
+    def generate_report(self) -> Dict:
+        \"\"\"Generate comprehensive training summary report.\"\"\"
+        if not self.episode_scores:
+            return {}
+        ma = self.get_moving_averages()
+        report = {
+            'total_episodes': len(self.episode_scores),
+            'best_score': self.best_score,
+            'best_episode': self.best_episode,
+            'final_score': self.episode_scores[-1],
+            'average_score': np.mean(self.episode_scores),
+            'median_score': np.median(self.episode_scores),
+            'std_score': np.std(self.episode_scores),
+            'average_loss': np.mean(self.episode_losses),
+            'average_steps': np.mean(self.episode_steps),
+            'total_training_time': np.sum(self.episode_times),
+            'recent_score_ma': ma.get('score_ma', 0),
+            'recent_loss_ma': ma.get('loss_ma', 0),
+            'recent_score_std': ma.get('score_std', 0)
+        }
+        return report
+    def save_metrics(self, filename):
+        \"\"\"Save metrics to file.\"\"\"
+        metrics_data = {
+            'episode_scores': self.episode_scores,
+            'episode_losses': self.episode_losses,
+            'episode_epsilons': self.episode_epsilons,
+            'episode_steps': self.episode_steps,
+            'episode_times': self.episode_times,
+            'best_score': self.best_score,
+            'best_episode': self.best_episode
+        }
+        np.savez(filename, **metrics_data)
+        print(f"Metrics saved to {filename}")
+def evaluate_agent(env, model, device, episodes=10, render=False, epsilon=0.01):
+    \"\"\"
+    Evaluate the trained agent on multiple episodes.
+    Returns:
+        dict: Evaluation metrics
+    \"\"\"
+    model.eval()
+    episode_scores = []
+    episode_steps = []
+    episode_infos = []
+    for episode in range(episodes):
+        state = env.reset()
+        total_reward = 0
+        steps = 0
+        done = False
+        episode_info = {}
+        while not done:
+            if render:
+                env.render()
+            # Epsilon-greedy evaluation (small epsilon for evaluation)
+            if np.random.random() < epsilon:
+                action = env.action_space.sample()
+            else:
+                state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
+                with torch.no_grad():
+                    q_values = model(state_tensor)
+                action = q_values.argmax().item()
+            next_state, reward, done, info = env.step(action)
+            total_reward += reward
+            state = next_state
+            steps += 1
+            # Store final info
+            if done:
+                episode_info = info.copy()
+        episode_scores.append(total_reward)
+        episode_steps.append(steps)
+        episode_infos.append(episode_info)
+        print(f"Evaluation Episode {episode + 1}/{episodes}: "
+              f"Score: {total_reward:.1f}, Steps: {steps}")
+    model.train()
+    # Calculate evaluation metrics
+    metrics = {
+        'mean_score': np.mean(episode_scores),
+        'std_score': np.std(episode_scores),
+        'median_score': np.median(episode_scores),
+        'max_score': np.max(episode_scores),
+        'min_score': np.min(episode_scores),
+        'mean_steps': np.mean(episode_steps),
+        'completion_rate': sum(1 for info in episode_infos if info.get('flag_get', False)) / episodes,
+        'average_coins': np.mean([info.get('coins', 0) for info in episode_infos]),
+        'average_x_pos': np.mean([info.get('x_pos', 0) for info in episode_infos])
+    }
+    print(f"\\nEvaluation Results ({episodes} episodes):")
+    print(f"Average Score: {metrics['mean_score']:.2f} ± {metrics['std_score']:.2f}")
+    print(f"Completion Rate: {metrics['completion_rate']:.1%}")
+    print(f"Average Coins: {metrics['average_coins']:.1f}")
+    print(f"Average Final X Position: {metrics['average_x_pos']:.1f}")
+    return metrics
+# Usage example:
+def setup_evaluation_system():
+    metrics = TrainingMetrics(window_size=100)
+    # Example usage during training:
+    # for episode in range(episodes):
+    #     score, loss, steps, epsilon, time_taken = run_episode(...)
+    #     metrics.update(episode, score, loss, epsilon, steps, time_taken)
+    #
+    #     if episode % 100 == 0:
+    #         metrics.plot_training_progress()
+    #         report = metrics.generate_report()
+    return metrics
+""",
+            "Evaluation"
+        )
+        exercise8.exerciseCompleted.connect(self.on_exercise_completed)
+        exercise_tabs.addTab(exercise8, "Evaluation")
+        layout.addWidget(exercise_tabs)
+        return widget
+    def change_content(self, index):
+        self.content_tabs.setCurrentIndex(index)
+    def on_exercise_completed(self, exercise_name, completed):
+        if completed:
+            self.exercise_completion[exercise_name] = True
+            self.update_progress()
+    def update_progress(self):
+        # Update section completion from checkboxes
+        for section, widget in self.section_widgets.items():
+            self.section_completion[section] = widget.is_completed
+        # Calculate progress
+        completed_sections = sum(self.section_completion.values())
+        completed_exercises = sum(self.exercise_completion.values())
+        total_sections = len(self.section_completion)
+        total_exercises = len(self.exercise_completion)
+        # Calculate overall progress (50% sections, 50% exercises)
+        section_progress = (completed_sections / total_sections) * 50
+        exercise_progress = (completed_exercises / total_exercises) * 50
+        overall_progress = section_progress + exercise_progress
+        # Animate progress bar
+        current_value = self.progress_bar.value()
+        if overall_progress > current_value:
+            # Animate progress increase
+            timer = QTimer(self)
+            timer.timeout.connect(lambda: self.animate_progress(current_value, overall_progress, timer))
+            timer.start(50)
+        else:
+            self.progress_bar.setValue(int(overall_progress))
+        # Update UI
+        self.exercise_count.setText(f"Exercises: {completed_exercises}/{total_exercises} completed")
+        self.section_count.setText(f"Sections: {completed_sections}/{total_sections} completed")
+        # Update list widget with completion indicators
+        for i in range(self.nav_list.count()):
+            item = self.nav_list.item(i)
+            section_name = item.text().replace('✓ ', '')
+            if self.section_completion.get(section_name, False):
+                item.setText(f"✓ {section_name}")
+                item.setBackground(QColor(40, 167, 69, 50))  # Light green background
+            else:
+                item.setText(section_name)
+                item.setBackground(QColor(255, 255, 255))  # White background
+        # Show celebration message when all completed
+        if completed_sections == total_sections and completed_exercises == total_exercises:
+            self.show_completion_celebration()
+    def animate_progress(self, start, end, timer):
+        current = self.progress_bar.value() + 2
+        if current >= end:
+            self.progress_bar.setValue(int(end))
+            timer.stop()
+        else:
+            self.progress_bar.setValue(current)
+    def show_completion_celebration(self):
+        msg = QMessageBox(self)
+        msg.setWindowTitle("🎉 Congratulations!")
+        msg.setText("""
+        <h2>Amazing Achievement! 🏆</h2>
+        <p>You've successfully completed the entire Dueling DQN tutorial!</p>
+        <p><strong>You now have:</strong></p>
+        <ul>
+            <li>Solid understanding of Dueling DQN architecture</li>
+            <li>Hands-on experience building reinforcement learning systems</li>
+            <li>Working knowledge of PyTorch for deep RL</li>
+            <li>Complete implementation of an AI that can play Super Mario Bros!</li>
+        </ul>
+        <p>Keep exploring and building amazing AI projects! 🚀</p>
+        """)
+        msg.setIcon(QMessageBox.Information)
+        msg.exec_()
+    def apply_styling(self):
+        # Enhanced styling
+        self.setStyleSheet("""
+            QMainWindow {
+                background: qlineargradient(x1: 0, y1: 0, x2: 1, y2: 1,
+                                          stop: 0 #f8f9fa, stop: 1 #e9ecef);
+                font-family: 'Segoe UI', Arial, sans-serif;
+            }
+            QWidget {
+                background: transparent;
+                color: #2c3e50;
+            }
+            /* Navigation Sidebar */
+            QListWidget {
+                background: white;
+                border: 1px solid #dee2e6;
+                border-radius: 8px;
+                outline: none;
+                font-size: 14px;
+            }
+            QListWidget::item {
+                background: white;
+                border-bottom: 1px solid #f8f9fa;
+                padding: 12px 15px;
+                color: #495057;
+            }
+            QListWidget::item:selected {
+                background: qlineargradient(x1: 0, y1: 0, x2: 1, y2: 0,
+                                          stop: 0 #007bff, stop: 1 #0056b3);
+                color: white;
+                font-weight: bold;
+                border-radius: 4px;
+            }
+            QListWidget::item:hover:!selected {
+                background: #e9ecef;
+                border-radius: 4px;
+            }
+            /* Progress Section */
+            QGroupBox {
+                background: white;
+                border: 1px solid #dee2e6;
+                border-radius: 8px;
+                margin-top: 10px;
+                padding-top: 15px;
+                font-weight: bold;
+                color: #495057;
+            }
+            QGroupBox::title {
+                subcontrol-origin: margin;
+                left: 10px;
+                padding: 0 8px 0 8px;
+                background: white;
+                color: #495057;
+            }
+            QProgressBar {
+                border: 1px solid #ced4da;
+                border-radius: 4px;
+                text-align: center;
+                background: #e9ecef;
+                color: #495057;
+            }
+            QProgressBar::chunk {
+                background: qlineargradient(x1: 0, y1: 0, x2: 1, y2: 0,
+                                          stop: 0 #28a745, stop: 1 #20c997);
+                border-radius: 3px;
+            }
+            /* Tabs */
+            QTabWidget::pane {
+                border: 1px solid #dee2e6;
+                border-radius: 8px;
+                background: white;
+            }
+            QTabBar::tab {
+                background: #f8f9fa;
+                border: 1px solid #dee2e6;
+                border-bottom: none;
+                padding: 8px 16px;
+                margin-right: 2px;
+                border-top-left-radius: 4px;
+                border-top-right-radius: 4px;
+                color: #6c757d;
+            }
+            QTabBar::tab:selected {
+                background: white;
+                border-bottom: 2px solid #007bff;
+                color: #007bff;
+                font-weight: bold;
+            }
+            QTabBar::tab:hover:!selected {
+                background: #e9ecef;
+                color: #495057;
+            }
+            /* Buttons */
+            QPushButton {
+                background: qlineargradient(x1: 0, y1: 0, x2: 0, y2: 1,
+                                          stop: 0 #007bff, stop: 1 #0056b3);
+                color: white;
+                border: none;
+                padding: 8px 16px;
+                border-radius: 4px;
+                font-weight: bold;
+            }
+            QPushButton:hover {
+                background: qlineargradient(x1: 0, y1: 0, x2: 0, y2: 1,
+                                          stop: 0 #0056b3, stop: 1 #004085);
+            }
+            QPushButton:pressed {
+                background: #004085;
+            }
+            /* Text areas */
+            QTextEdit {
+                background: white;
+                border: 1px solid #dee2e6;
+                border-radius: 5px;
+                padding: 10px;
+                selection-background-color: #007bff;
+            }
+            /* Checkboxes */
+            QCheckBox {
+                spacing: 8px;
+                color: #495057;
+            }
+            QCheckBox::indicator {
+                width: 16px;
+                height: 16px;
+                border: 2px solid #adb5bd;
+                border-radius: 3px;
+                background: white;
+            }
+            QCheckBox::indicator:checked {
+                background: #007bff;
+                border: 2px solid #007bff;
+            }
+            QCheckBox::indicator:checked:hover {
+                background: #0056b3;
+                border: 2px solid #0056b3;
+            }
+        """)
+if __name__ == "__main__":
+    app = QApplication(sys.argv)
+    # Set application-wide font
+    font = QFont("Segoe UI", 10)
+    app.setFont(font)
+    window = DuelingDQNTutorialApp()
+    window.show()
+    sys.exit(app.exec_())

pyqt5_duel_dqn_super_mario_bros_tutorial/flux_krea_00776_.png ADDED Viewed

Git LFS Details

SHA256: 9b38502348cc00c0ea8949e1165b55bee8ec9c8ec35fc8d9143fa571b5a49e98
Pointer size: 132 Bytes
Size of remote file: 1.37 MB

pyqt5_duel_dqn_super_mario_bros_tutorial/installed_packages_dqn.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+Markdown==3.10
+PyQt5==5.15.11
+PyQt5-Qt5==5.15.17
+PyQt5_sip==12.17.1