Spaces:

Naz786
/

Q-Learning-Sensor-Placement

Sleeping

App Files Files Community

Naz786 committed on Jan 5

Commit

9c74b9c

verified ·

1 Parent(s): 25f411f

Create app.py

Browse files

Files changed (1) hide show

app.py +758 -0

app.py ADDED Viewed

	@@ -0,0 +1,758 @@

+"""
+Q-Learning AI for Sensor Placement - Interactive Demo
+For Hugging Face Spaces
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import gradio as gr
+from collections import defaultdict
+np.random.seed(42)
+# ==============================================================================
+# PART 1: THE SECRET WORLD
+# ==============================================================================
+class ThiefWorld:
+    """Where thieves REALLY appear (AI must discover this!)"""
+    def __init__(self, hotspot1=2.5, hotspot2=7.0):
+        self.hotspot1 = hotspot1
+        self.hotspot2 = hotspot2
+        self.n_zones = 10
+    def get_thief_probability(self, zone):
+        zone_center = zone + 0.5
+        prob = (
+            0.6 * np.exp(-((zone_center - self.hotspot1)**2) / 1.0) +
+            0.4 * np.exp(-((zone_center - self.hotspot2)**2) / 0.8) +
+            0.05
+        )
+        return min(prob, 1.0)
+    def generate_thieves(self):
+        thieves = np.zeros(self.n_zones)
+        for zone in range(self.n_zones):
+            if np.random.random() < self.get_thief_probability(zone):
+                thieves[zone] = 1
+        return thieves
+# ==============================================================================
+# PART 2: SENSOR
+# ==============================================================================
+class Sensor:
+    def __init__(self, catch_probability=0.9):
+        self.catch_prob = catch_probability
+    def try_catch(self, thief_present):
+        if thief_present:
+            return np.random.random() < self.catch_prob
+        return False
+# ==============================================================================
+# PART 3: ENVIRONMENT
+# ==============================================================================
+class SensorPlacementEnv:
+    def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0):
+        self.world = ThiefWorld(hotspot1, hotspot2)
+        self.sensor = Sensor()
+        self.n_sensors = n_sensors
+        self.n_zones = 10
+        self.reset()
+    def reset(self):
+        self.zone_attempts = np.zeros(self.n_zones)
+        self.zone_catches = np.zeros(self.n_zones)
+        self.day = 0
+        self.total_caught = 0
+        self.total_thieves = 0
+        return self._get_state()
+    def _get_state(self):
+        if self.zone_attempts.sum() == 0:
+            return (0, 0)
+        most_tried = int(np.argmax(self.zone_attempts))
+        catch_rates = np.zeros(self.n_zones)
+        for z in range(self.n_zones):
+            if self.zone_attempts[z] > 0:
+                catch_rates[z] = self.zone_catches[z] / self.zone_attempts[z]
+        best_zone = int(np.argmax(catch_rates))
+        return (most_tried, best_zone)
+    def step(self, action):
+        thieves = self.world.generate_thieves()
+        n_thieves = int(thieves.sum())
+        self.total_thieves += n_thieves
+        caught = 0
+        for zone in action:
+            if zone < self.n_zones:
+                self.zone_attempts[zone] += 1
+                if thieves[zone] == 1:
+                    if self.sensor.try_catch(True):
+                        caught += 1
+                        self.zone_catches[zone] += 1
+        self.total_caught += caught
+        self.day += 1
+        reward = caught + 0.1 * len(set(action))
+        done = self.day >= 30
+        return self._get_state(), reward, done, {'caught': caught}
+# ==============================================================================
+# PART 4: Q-LEARNING AGENT
+# ==============================================================================
+class QLearningAgent:
+    def __init__(self):
+        self.q_table = defaultdict(lambda: defaultdict(float))
+        self.learning_rate = 0.1
+        self.discount_factor = 0.95
+        self.epsilon = 1.0
+        self.epsilon_decay = 0.995
+        self.epsilon_min = 0.01
+    def _get_possible_actions(self):
+        return [
+            (1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8),
+            (0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5),
+            (5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7),
+            (2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7),
+            (3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9),
+        ]
+    def choose_action(self, state):
+        actions = self._get_possible_actions()
+        if np.random.random() < self.epsilon:
+            return actions[np.random.randint(len(actions))]
+        else:
+            best_action = None
+            best_value = -999999
+            for action in actions:
+                value = self.q_table[state][action]
+                if value > best_value:
+                    best_value = value
+                    best_action = action
+            if best_action is None:
+                best_action = actions[np.random.randint(len(actions))]
+            return best_action
+    def learn(self, state, action, reward, next_state, done):
+        old_q = self.q_table[state][action]
+        if done:
+            max_future_q = 0
+        else:
+            actions = self._get_possible_actions()
+            max_future_q = max([self.q_table[next_state][a] for a in actions])
+        target = reward + self.discount_factor * max_future_q
+        new_q = old_q + self.learning_rate * (target - old_q)
+        self.q_table[state][action] = new_q
+    def decay_epsilon(self):
+        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+# ==============================================================================
+# TRAINING AND TESTING FUNCTIONS
+# ==============================================================================
+def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()):
+    """Train AI and compare with other strategies."""
+    np.random.seed(42)
+    # Training
+    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
+    agent = QLearningAgent()
+    episode_rewards = []
+    episode_catch_rates = []
+    epsilon_history = []
+    for episode in progress.tqdm(range(n_episodes), desc="Training AI"):
+        state = env.reset()
+        total_reward = 0
+        for day in range(30):
+            action = agent.choose_action(state)
+            next_state, reward, done, _ = env.step(action)
+            agent.learn(state, action, reward, next_state, done)
+            state = next_state
+            total_reward += reward
+            if done:
+                break
+        agent.decay_epsilon()
+        episode_rewards.append(total_reward)
+        catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
+        episode_catch_rates.append(catch_rate)
+        epsilon_history.append(agent.epsilon)
+    # Testing
+    n_tests = 50
+    results = {}
+    # Q-Learning AI
+    agent.epsilon = 0
+    catches = []
+    for _ in range(n_tests):
+        state = env.reset()
+        for day in range(30):
+            action = agent.choose_action(state)
+            state, _, done, _ = env.step(action)
+            if done:
+                break
+        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
+    results['Q-Learning AI'] = np.mean(catches)
+    # Random
+    catches = []
+    for _ in range(n_tests):
+        env.reset()
+        for day in range(30):
+            action = tuple(np.random.choice(10, 4, replace=False))
+            _, _, done, _ = env.step(action)
+            if done:
+                break
+        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
+    results['Random'] = np.mean(catches)
+    # Static
+    catches = []
+    for _ in range(n_tests):
+        env.reset()
+        for day in range(30):
+            _, _, done, _ = env.step((1, 3, 6, 8))
+            if done:
+                break
+        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
+    results['Static Uniform'] = np.mean(catches)
+    # Perfect
+    h1_zone = int(hotspot1)
+    h2_zone = int(hotspot2)
+    perfect_action = (h1_zone, h1_zone+1, h2_zone, h2_zone+1)
+    perfect_action = tuple(min(z, 9) for z in perfect_action)
+    catches = []
+    for _ in range(n_tests):
+        env.reset()
+        for day in range(30):
+            _, _, done, _ = env.step(perfect_action)
+            if done:
+                break
+        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
+    results['Perfect (Cheating)'] = np.mean(catches)
+    # Create plots
+    fig = plt.figure(figsize=(16, 12))
+    # Plot 1: Learning curve
+    ax1 = fig.add_subplot(2, 2, 1)
+    window = max(10, n_episodes // 20)
+    if len(episode_catch_rates) >= window:
+        smoothed = np.convolve(episode_catch_rates, np.ones(window)/window, mode='valid')
+        ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw')
+        ax1.plot(range(window-1, len(episode_catch_rates)), smoothed,
+                color='green', linewidth=2, label='Smoothed')
+    else:
+        ax1.plot(episode_catch_rates, color='green', linewidth=2)
+    ax1.set_xlabel('Episode', fontsize=12)
+    ax1.set_ylabel('Catch Rate (%)', fontsize=12)
+    ax1.set_title('🎓 AI Learning Progress', fontsize=14)
+    ax1.legend()
+    ax1.grid(True, alpha=0.3)
+    # Plot 2: Epsilon decay
+    ax2 = fig.add_subplot(2, 2, 2)
+    ax2.plot(epsilon_history, color='purple', linewidth=2)
+    ax2.set_xlabel('Episode', fontsize=12)
+    ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12)
+    ax2.set_title('🔍 Explore vs Exploit Balance', fontsize=14)
+    ax2.grid(True, alpha=0.3)
+    # Add annotations
+    ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10,
+                xytext=(n_episodes*0.1, 0.8), arrowprops=dict(arrowstyle='->', color='gray'))
+    ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes-1, epsilon_history[-1]), fontsize=10,
+                xytext=(n_episodes*0.7, 0.3), arrowprops=dict(arrowstyle='->', color='gray'))
+    # Plot 3: What AI learned vs Truth
+    ax3 = fig.add_subplot(2, 2, 3)
+    zone_values = np.zeros(10)
+    zone_counts = np.zeros(10)
+    for state, actions in agent.q_table.items():
+        for action, value in actions.items():
+            for zone in action:
+                zone_values[zone] += value
+                zone_counts[zone] += 1
+    zone_counts[zone_counts == 0] = 1
+    learned = zone_values / zone_counts
+    world = ThiefWorld(hotspot1, hotspot2)
+    truth = [world.get_thief_probability(z) for z in range(10)]
+    x = np.arange(10)
+    width = 0.35
+    ax3.bar(x - width/2, learned / max(learned.max(), 0.01), width,
+            label='AI Learned', color='blue', alpha=0.7)
+    ax3.bar(x + width/2, np.array(truth) / max(truth), width,
+            label='True Probability', color='red', alpha=0.7)
+    ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1 ({hotspot1})')
+    ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2 ({hotspot2})')
+    ax3.set_xlabel('Zone', fontsize=12)
+    ax3.set_ylabel('Normalized Value', fontsize=12)
+    ax3.set_title('🧠 Did AI Learn the Truth?', fontsize=14)
+    ax3.legend(loc='upper right')
+    ax3.grid(True, alpha=0.3)
+    ax3.set_xticks(range(10))
+    # Plot 4: Final comparison
+    ax4 = fig.add_subplot(2, 2, 4)
+    names = list(results.keys())
+    values = list(results.values())
+    colors = ['green', 'gray', 'orange', 'blue']
+    bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black')
+    for bar, val in zip(bars, values):
+        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
+                f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')
+    ax4.set_ylabel('Catch Rate (%)', fontsize=12)
+    ax4.set_title('🏆 Final Comparison', fontsize=14)
+    ax4.grid(True, alpha=0.3, axis='y')
+    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right')
+    plt.tight_layout()
+    # Results text
+    results_text = f"""
+## 🎯 Training Complete!
+### Training Summary:
+- Episodes trained: **{n_episodes}**
+- Hotspot 1: Zone **{hotspot1}**
+- Hotspot 2: Zone **{hotspot2}**
+- Final exploration rate: **{epsilon_history[-1]*100:.1f}%**
+### 📊 Test Results (50 test runs each):
+| Strategy | Catch Rate |
+|----------|------------|
+| 🏆 **Q-Learning AI** | **{results['Q-Learning AI']:.1f}%** |
+| Random | {results['Random']:.1f}% |
+| Static Uniform | {results['Static Uniform']:.1f}% |
+| Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% |
+### 🧠 What AI Learned:
+The AI discovered that zones **{int(hotspot1)}** and **{int(hotspot2)}** have more thieves!
+### 🎓 Key Insight:
+AI started knowing **NOTHING** and learned through **trial and error**!
+"""
+    return fig, results_text
+def explain_qlearning():
+    """Create explanation visualization."""
+    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+    # Plot 1: Q-Learning cycle
+    ax1 = axes[0]
+    ax1.axis('off')
+    # Draw cycle
+    cycle_text = """
+    ┌─────────────────────────────────────────────────────────────┐
+    │                    Q-LEARNING CYCLE                         │
+    │                                                             │
+    │                      ┌─────────┐                            │
+    │                      │  STATE  │                            │
+    │                      │(What AI │                            │
+    │                      │  sees)  │                            │
+    │                      └────┬────┘                            │
+    │                           │                                 │
+    │                           ▼                                 │
+    │    ┌──────────┐     ┌─────────┐     ┌──────────┐           │
+    │    │  UPDATE  │◄────│ ACTION  │────►│  REWARD  │           │
+    │    │ Q-TABLE  │     │(Place   │     │(Caught   │           │
+    │    │(Remember)│     │sensors) │     │thieves?) │           │
+    │    └──────────┘     └─────────┘     └──────────┘           │
+    │         │                                 │                 │
+    │         └─────────────────────────────────┘                 │
+    │                     REPEAT!                                 │
+    └─────────────────────────────────────────────────────────────┘
+    """
+    ax1.text(0.5, 0.5, cycle_text, transform=ax1.transAxes, fontsize=10,
+             verticalalignment='center', horizontalalignment='center',
+             fontfamily='monospace', bbox=dict(boxstyle='round', facecolor='lightyellow'))
+    ax1.set_title('How Q-Learning Works', fontsize=14)
+    # Plot 2: Epsilon explanation
+    ax2 = axes[1]
+    episodes = np.arange(500)
+    epsilon = 1.0 * (0.995 ** episodes)
+    epsilon = np.maximum(epsilon, 0.01)
+    ax2.fill_between(episodes, epsilon, alpha=0.3, color='blue', label='EXPLORE')
+    ax2.fill_between(episodes, 0, 1-epsilon, alpha=0.3, color='green', label='EXPLOIT')
+    ax2.plot(episodes, epsilon, 'b-', linewidth=2)
+    ax2.plot(episodes, 1-epsilon, 'g-', linewidth=2)
+    ax2.axvline(50, color='gray', linestyle='--', alpha=0.5)
+    ax2.axvline(200, color='gray', linestyle='--', alpha=0.5)
+    ax2.axvline(400, color='gray', linestyle='--', alpha=0.5)
+    ax2.text(25, 0.5, 'Early:\n80% Explore', fontsize=9, ha='center')
+    ax2.text(125, 0.5, 'Middle:\n50-50', fontsize=9, ha='center')
+    ax2.text(300, 0.5, 'Late:\n80% Exploit', fontsize=9, ha='center')
+    ax2.set_xlabel('Episode', fontsize=12)
+    ax2.set_ylabel('Probability', fontsize=12)
+    ax2.set_title('Explore vs Exploit Over Time', fontsize=14)
+    ax2.legend(loc='center right')
+    ax2.grid(True, alpha=0.3)
+    plt.tight_layout()
+    return fig
+def show_environment(hotspot1, hotspot2):
+    """Visualize the thief world."""
+    fig, ax = plt.subplots(figsize=(12, 5))
+    world = ThiefWorld(hotspot1, hotspot2)
+    zones = np.arange(10)
+    probs = [world.get_thief_probability(z) for z in zones]
+    colors = ['red' if p > 0.4 else 'orange' if p > 0.2 else 'green' for p in probs]
+    bars = ax.bar(zones, probs, color=colors, alpha=0.7, edgecolor='black')
+    for bar, prob in zip(bars, probs):
+        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
+               f'{prob*100:.0f}%', ha='center', fontsize=10, fontweight='bold')
+    ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2, label=f'Hotspot 1 ({hotspot1})')
+    ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2, label=f'Hotspot 2 ({hotspot2})')
+    ax.set_xlabel('Zone', fontsize=12)
+    ax.set_ylabel('Thief Probability', fontsize=12)
+    ax.set_title('🦹 Secret Thief Locations (AI Must Discover This!)', fontsize=14)
+    ax.set_xticks(zones)
+    ax.legend()
+    ax.grid(True, alpha=0.3, axis='y')
+    plt.tight_layout()
+    return fig
+def simulate_one_episode(hotspot1, hotspot2):
+    """Simulate and visualize one episode."""
+    np.random.seed(None)  # Random seed for variety
+    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
+    agent = QLearningAgent()
+    agent.epsilon = 0.5  # 50% explore for demo
+    state = env.reset()
+    # Track daily data
+    daily_actions = []
+    daily_caught = []
+    daily_thieves = []
+    for day in range(30):
+        action = agent.choose_action(state)
+        daily_actions.append(action)
+        old_caught = env.total_caught
+        old_thieves = env.total_thieves
+        state, reward, done, info = env.step(action)
+        daily_caught.append(env.total_caught - old_caught)
+        daily_thieves.append(env.total_thieves - old_thieves)
+        agent.learn(state, action, reward, state, done)
+    # Create visualization
+    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+    # Plot 1: Sensor placements over days
+    ax1 = axes[0, 0]
+    for day, action in enumerate(daily_actions):
+        for zone in action:
+            ax1.scatter(day, zone, c='blue', s=30, alpha=0.6)
+    ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1')
+    ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2')
+    ax1.set_xlabel('Day', fontsize=12)
+    ax1.set_ylabel('Zone', fontsize=12)
+    ax1.set_title('📍 Where AI Placed Sensors Each Day', fontsize=14)
+    ax1.legend()
+    ax1.grid(True, alpha=0.3)
+    ax1.set_yticks(range(10))
+    # Plot 2: Daily catches
+    ax2 = axes[0, 1]
+    days = range(1, 31)
+    ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught')
+    ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves')
+    ax2.set_xlabel('Day', fontsize=12)
+    ax2.set_ylabel('Count', fontsize=12)
+    ax2.set_title('🎯 Daily Catches', fontsize=14)
+    ax2.legend()
+    ax2.grid(True, alpha=0.3)
+    # Plot 3: Cumulative performance
+    ax3 = axes[1, 0]
+    cum_caught = np.cumsum(daily_caught)
+    cum_thieves = np.cumsum(daily_thieves)
+    ax3.fill_between(days, cum_caught, alpha=0.3, color='green')
+    ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught')
+    ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves')
+    ax3.set_xlabel('Day', fontsize=12)
+    ax3.set_ylabel('Cumulative Count', fontsize=12)
+    ax3.set_title('📈 Cumulative Performance', fontsize=14)
+    ax3.legend()
+    ax3.grid(True, alpha=0.3)
+    # Plot 4: Zone usage
+    ax4 = axes[1, 1]
+    zone_usage = np.zeros(10)
+    for action in daily_actions:
+        for zone in action:
+            zone_usage[zone] += 1
+    colors = ['blue' if z in [int(hotspot1), int(hotspot1)+1, int(hotspot2), int(hotspot2)+1]
+              else 'gray' for z in range(10)]
+    ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black')
+    ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5)
+    ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5)
+    ax4.set_xlabel('Zone', fontsize=12)
+    ax4.set_ylabel('Times Used', fontsize=12)
+    ax4.set_title('🗺️ Zone Usage (Blue = Near Hotspots)', fontsize=14)
+    ax4.set_xticks(range(10))
+    ax4.grid(True, alpha=0.3, axis='y')
+    plt.tight_layout()
+    # Summary
+    catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
+    summary = f"""
+## 📊 Episode Summary
+- **Total Thieves:** {env.total_thieves}
+- **Total Caught:** {env.total_caught}
+- **Catch Rate:** {catch_rate:.1f}%
+### Zones Most Used:
+{', '.join([f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1]])}
+### Note:
+This is just ONE episode with 50% exploration.
+Train for 500+ episodes to see real learning!
+"""
+    return fig, summary
+# ==============================================================================
+# GRADIO INTERFACE
+# ==============================================================================
+# Build the five-tab Gradio UI. Widgets are registered in the order they are
+# created inside the nested `with` contexts; `demo` is the resulting Blocks app.
+with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🤖 Q-Learning AI for Sensor Placement
+    **Watch an AI learn where to place sensors to catch thieves!**
+    The AI starts knowing NOTHING and learns through trial-and-error.
+    ---
+    """)
+    with gr.Tabs():
+        # ==== TAB 1: Explanation ====
+        with gr.TabItem("1️⃣ What is Q-Learning?"):
+            gr.Markdown("""
+            ## 🎓 Q-Learning Explained Simply
+            ### Like Teaching a Dog:
+            ```
+            1. Dog tries something → 2. Gets treat (or not) → 3. Remembers → 4. Gets smarter!
+            ```
+            ### For Our AI:
+            ```
+            1. AI places sensors → 2. Catches thieves (reward!) → 3. Updates Q-Table → 4. Gets smarter!
+            ```
+            ### The Q-Table (AI's Memory):
+            | State | Action | Expected Reward |
+            |-------|--------|-----------------|
+            | "Day 1" | Zones (1,3,6,8) | 1.5 points |
+            | "Day 1" | Zones (2,3,7,8) | 3.2 points ← Better! |
+            ### Explore vs Exploit:
+            - **EXPLORE**: Try random things to learn
+            - **EXPLOIT**: Use what you already know
+            Early training → More EXPLORE
+            Late training → More EXPLOIT
+            """)
+            explain_btn = gr.Button("📊 Show Visual Explanation", variant="primary")
+            explain_plot = gr.Plot()
+            # No inputs: the click renders the figure returned by explain_qlearning.
+            explain_btn.click(explain_qlearning, outputs=explain_plot)
+        # ==== TAB 2: Environment ====
+        with gr.TabItem("2️⃣ The Secret World"):
+            gr.Markdown("""
+            ## 🦹 Where Do Thieves Appear?
+            The AI doesn't know this! It must DISCOVER it through learning.
+            Adjust the hotspot locations and see the thief distribution:
+            """)
+            with gr.Row():
+                h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location")
+                h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location")
+            env_btn = gr.Button("🗺️ Show Thief Distribution", variant="primary")
+            env_plot = gr.Plot()
+            # Slider values are passed positionally as (hotspot1, hotspot2).
+            env_btn.click(show_environment, [h1_slider, h2_slider], env_plot)
+        # ==== TAB 3: One Episode ====
+        with gr.TabItem("3️⃣ Watch One Episode"):
+            gr.Markdown("""
+            ## 👀 See One Month (30 Days) of Simulation
+            Watch how AI makes decisions and catches thieves.
+            (Note: This is untrained AI with 50% exploration rate)
+            """)
+            with gr.Row():
+                h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
+                h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
+            ep_btn = gr.Button("▶️ Run One Episode", variant="primary")
+            ep_plot = gr.Plot()
+            ep_summary = gr.Markdown()
+            # simulate_one_episode returns (figure, markdown), matching the two outputs.
+            ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary])
+        # ==== TAB 4: Full Training ====
+        with gr.TabItem("4️⃣ Train the AI!"):
+            gr.Markdown("""
+            ## 🏋️ Train Q-Learning AI
+            Train the AI and compare it against other strategies!
+            ⚠️ Training takes a few seconds depending on episodes.
+            """)
+            with gr.Row():
+                episodes_slider = gr.Slider(100, 1000, value=300, step=50,
+                                           label="Number of Episodes")
+            with gr.Row():
+                h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
+                h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
+            train_btn = gr.Button("🚀 Train AI!", variant="primary", size="lg")
+            train_plot = gr.Plot()
+            train_results = gr.Markdown()
+            # Inputs map to (n_episodes, hotspot1, hotspot2); the returned
+            # (figure, markdown) pair fills the two outputs.
+            train_btn.click(train_and_test,
+                           [episodes_slider, h1_train, h2_train],
+                           [train_plot, train_results])
+        # ==== TAB 5: Summary ====
+        # Static reference text only; this tab has no widgets or callbacks.
+        with gr.TabItem("5️⃣ Key Concepts"):
+            gr.Markdown("""
+            ## 📚 Summary: Q-Learning Key Concepts
+            ### 1. Q-Table
+            ```
+            A "cheat sheet" that stores:
+            "In STATE X, if I do ACTION Y, I expect REWARD Z"
+            ```
+            ### 2. State
+            ```
+            What the AI "sees" at any moment.
+            Example: (most_tried_zone, best_zone_so_far)
+            ```
+            ### 3. Action
+            ```
+            What the AI can do.
+            Example: Place sensors in zones (2, 3, 7, 8)
+            ```
+            ### 4. Reward
+            ```
+            Points for good actions.
+            Example: +1 for each thief caught
+            ```
+            ### 5. Epsilon (ε)
+            ```
+            Exploration rate.
+            ε = 1.0 → 100% random (exploring)
+            ε = 0.01 → 1% random (exploiting knowledge)
+            ```
+            ### 6. Learning Formula
+            ```
+            Q(s,a) = Q(s,a) + α × (reward + γ × max(Q(s',a')) - Q(s,a))
+            In simple terms:
+            New Memory = Old Memory + Learning Rate × (Reality - Expectation)
+            ```
+            ---
+            ## 🎯 Why This Matters
+            This same technique is used in:
+            - 🎮 Game AI (AlphaGo, Chess engines)
+            - 🚗 Self-driving cars
+            - 🤖 Robots
+            - 📱 Recommendation systems
+            **You just learned how real AI works!** 🎓
+            """)
+    gr.Markdown("""
+    ---
+    ### 🔗 About
+    This demo shows **Q-Learning Reinforcement Learning** for sensor placement.
+    The AI learns through trial-and-error, just like humans!
+    """)
+# Launch the Gradio app only when this file is executed as a script, so that
+# importing the module does not call demo.launch().
+if __name__ == "__main__":
+    demo.launch()