""" Q-Learning AI for Sensor Placement - Interactive Demo For Hugging Face Spaces """ import numpy as np import matplotlib.pyplot as plt import gradio as gr from collections import defaultdict np.random.seed(42) # ============================================================================== # PART 1: THE SECRET WORLD # ============================================================================== class ThiefWorld: """Where thieves REALLY appear (AI must discover this!)""" def __init__(self, hotspot1=2.5, hotspot2=7.0): self.hotspot1 = hotspot1 self.hotspot2 = hotspot2 self.n_zones = 10 def get_thief_probability(self, zone): zone_center = zone + 0.5 prob = ( 0.6 * np.exp(-((zone_center - self.hotspot1)**2) / 1.0) + 0.4 * np.exp(-((zone_center - self.hotspot2)**2) / 0.8) + 0.05 ) return min(prob, 1.0) def generate_thieves(self): thieves = np.zeros(self.n_zones) for zone in range(self.n_zones): if np.random.random() < self.get_thief_probability(zone): thieves[zone] = 1 return thieves # ============================================================================== # PART 2: SENSOR # ============================================================================== class Sensor: def __init__(self, catch_probability=0.9): self.catch_prob = catch_probability def try_catch(self, thief_present): if thief_present: return np.random.random() < self.catch_prob return False # ============================================================================== # PART 3: ENVIRONMENT # ============================================================================== class SensorPlacementEnv: def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0): self.world = ThiefWorld(hotspot1, hotspot2) self.sensor = Sensor() self.n_sensors = n_sensors self.n_zones = 10 self.reset() def reset(self): self.zone_attempts = np.zeros(self.n_zones) self.zone_catches = np.zeros(self.n_zones) self.day = 0 self.total_caught = 0 self.total_thieves = 0 return self._get_state() def _get_state(self): if self.zone_attempts.sum() == 0: return (0, 0) most_tried = int(np.argmax(self.zone_attempts)) catch_rates = np.zeros(self.n_zones) for z in range(self.n_zones): if self.zone_attempts[z] > 0: catch_rates[z] = self.zone_catches[z] / self.zone_attempts[z] best_zone = int(np.argmax(catch_rates)) return (most_tried, best_zone) def step(self, action): thieves = self.world.generate_thieves() n_thieves = int(thieves.sum()) self.total_thieves += n_thieves caught = 0 for zone in action: if zone < self.n_zones: self.zone_attempts[zone] += 1 if thieves[zone] == 1: if self.sensor.try_catch(True): caught += 1 self.zone_catches[zone] += 1 self.total_caught += caught self.day += 1 reward = caught + 0.1 * len(set(action)) done = self.day >= 30 return self._get_state(), reward, done, {'caught': caught} # ============================================================================== # PART 4: Q-LEARNING AGENT # ============================================================================== class QLearningAgent: def __init__(self): self.q_table = defaultdict(lambda: defaultdict(float)) self.learning_rate = 0.1 self.discount_factor = 0.95 self.epsilon = 1.0 self.epsilon_decay = 0.995 self.epsilon_min = 0.01 def _get_possible_actions(self): return [ (1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8), (0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5), (5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7), (2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7), (3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9), ] def choose_action(self, state): actions = self._get_possible_actions() if np.random.random() < self.epsilon: return actions[np.random.randint(len(actions))] else: best_action = None best_value = -999999 for action in actions: value = self.q_table[state][action] if value > best_value: best_value = value best_action = action if best_action is None: best_action = actions[np.random.randint(len(actions))] return best_action def learn(self, state, action, reward, next_state, done): old_q = self.q_table[state][action] if done: max_future_q = 0 else: actions = self._get_possible_actions() max_future_q = max([self.q_table[next_state][a] for a in actions]) target = reward + self.discount_factor * max_future_q new_q = old_q + self.learning_rate * (target - old_q) self.q_table[state][action] = new_q def decay_epsilon(self): self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay) # ============================================================================== # TRAINING AND TESTING FUNCTIONS # ============================================================================== def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()): """Train AI and compare with other strategies.""" np.random.seed(42) # Training env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2) agent = QLearningAgent() episode_rewards = [] episode_catch_rates = [] epsilon_history = [] for episode in progress.tqdm(range(n_episodes), desc="Training AI"): state = env.reset() total_reward = 0 for day in range(30): action = agent.choose_action(state) next_state, reward, done, _ = env.step(action) agent.learn(state, action, reward, next_state, done) state = next_state total_reward += reward if done: break agent.decay_epsilon() episode_rewards.append(total_reward) catch_rate = env.total_caught / max(env.total_thieves, 1) * 100 episode_catch_rates.append(catch_rate) epsilon_history.append(agent.epsilon) # Testing n_tests = 50 results = {} # Q-Learning AI agent.epsilon = 0 catches = [] for _ in range(n_tests): state = env.reset() for day in range(30): action = agent.choose_action(state) state, _, done, _ = env.step(action) if done: break catches.append(env.total_caught / max(env.total_thieves, 1) * 100) results['Q-Learning AI'] = np.mean(catches) # Random catches = [] for _ in range(n_tests): env.reset() for day in range(30): action = tuple(np.random.choice(10, 4, replace=False)) _, _, done, _ = env.step(action) if done: break catches.append(env.total_caught / max(env.total_thieves, 1) * 100) results['Random'] = np.mean(catches) # Static catches = [] for _ in range(n_tests): env.reset() for day in range(30): _, _, done, _ = env.step((1, 3, 6, 8)) if done: break catches.append(env.total_caught / max(env.total_thieves, 1) * 100) results['Static Uniform'] = np.mean(catches) # Perfect h1_zone = int(hotspot1) h2_zone = int(hotspot2) perfect_action = (h1_zone, h1_zone+1, h2_zone, h2_zone+1) perfect_action = tuple(min(z, 9) for z in perfect_action) catches = [] for _ in range(n_tests): env.reset() for day in range(30): _, _, done, _ = env.step(perfect_action) if done: break catches.append(env.total_caught / max(env.total_thieves, 1) * 100) results['Perfect (Cheating)'] = np.mean(catches) # Create plots fig = plt.figure(figsize=(16, 12)) # Plot 1: Learning curve ax1 = fig.add_subplot(2, 2, 1) window = max(10, n_episodes // 20) if len(episode_catch_rates) >= window: smoothed = np.convolve(episode_catch_rates, np.ones(window)/window, mode='valid') ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw') ax1.plot(range(window-1, len(episode_catch_rates)), smoothed, color='green', linewidth=2, label='Smoothed') else: ax1.plot(episode_catch_rates, color='green', linewidth=2) ax1.set_xlabel('Episode', fontsize=12) ax1.set_ylabel('Catch Rate (%)', fontsize=12) ax1.set_title('๐ŸŽ“ AI Learning Progress', fontsize=14) ax1.legend() ax1.grid(True, alpha=0.3) # Plot 2: Epsilon decay ax2 = fig.add_subplot(2, 2, 2) ax2.plot(epsilon_history, color='purple', linewidth=2) ax2.set_xlabel('Episode', fontsize=12) ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12) ax2.set_title('๐Ÿ” Explore vs Exploit Balance', fontsize=14) ax2.grid(True, alpha=0.3) # Add annotations ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10, xytext=(n_episodes*0.1, 0.8), arrowprops=dict(arrowstyle='->', color='gray')) ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes-1, epsilon_history[-1]), fontsize=10, xytext=(n_episodes*0.7, 0.3), arrowprops=dict(arrowstyle='->', color='gray')) # Plot 3: What AI learned vs Truth ax3 = fig.add_subplot(2, 2, 3) zone_values = np.zeros(10) zone_counts = np.zeros(10) for state, actions in agent.q_table.items(): for action, value in actions.items(): for zone in action: zone_values[zone] += value zone_counts[zone] += 1 zone_counts[zone_counts == 0] = 1 learned = zone_values / zone_counts world = ThiefWorld(hotspot1, hotspot2) truth = [world.get_thief_probability(z) for z in range(10)] x = np.arange(10) width = 0.35 ax3.bar(x - width/2, learned / max(learned.max(), 0.01), width, label='AI Learned', color='blue', alpha=0.7) ax3.bar(x + width/2, np.array(truth) / max(truth), width, label='True Probability', color='red', alpha=0.7) ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1 ({hotspot1})') ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2 ({hotspot2})') ax3.set_xlabel('Zone', fontsize=12) ax3.set_ylabel('Normalized Value', fontsize=12) ax3.set_title('๐Ÿง  Did AI Learn the Truth?', fontsize=14) ax3.legend(loc='upper right') ax3.grid(True, alpha=0.3) ax3.set_xticks(range(10)) # Plot 4: Final comparison ax4 = fig.add_subplot(2, 2, 4) names = list(results.keys()) values = list(results.values()) colors = ['green', 'gray', 'orange', 'blue'] bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black') for bar, val in zip(bars, values): ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1, f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold') ax4.set_ylabel('Catch Rate (%)', fontsize=12) ax4.set_title('๐Ÿ† Final Comparison', fontsize=14) ax4.grid(True, alpha=0.3, axis='y') plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right') plt.tight_layout() # Results text results_text = f""" ## ๐ŸŽฏ Training Complete! ### Training Summary: - Episodes trained: **{n_episodes}** - Hotspot 1: Zone **{hotspot1}** - Hotspot 2: Zone **{hotspot2}** - Final exploration rate: **{epsilon_history[-1]*100:.1f}%** ### ๐Ÿ“Š Test Results (50 test runs each): | Strategy | Catch Rate | |----------|------------| | ๐Ÿ† **Q-Learning AI** | **{results['Q-Learning AI']:.1f}%** | | Random | {results['Random']:.1f}% | | Static Uniform | {results['Static Uniform']:.1f}% | | Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% | ### ๐Ÿง  What AI Learned: The AI discovered that zones **{int(hotspot1)}** and **{int(hotspot2)}** have more thieves! ### ๐ŸŽ“ Key Insight: AI started knowing **NOTHING** and learned through **trial and error**! """ return fig, results_text def explain_qlearning(): """Create explanation visualization.""" fig, axes = plt.subplots(1, 2, figsize=(14, 5)) # Plot 1: Q-Learning cycle ax1 = axes[0] ax1.axis('off') # Draw cycle cycle_text = """ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ Q-LEARNING CYCLE โ”‚ โ”‚ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ โ”‚ STATE โ”‚ โ”‚ โ”‚ โ”‚(What AI โ”‚ โ”‚ โ”‚ โ”‚ sees) โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ–ผ โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ โ”‚ UPDATE โ”‚โ—„โ”€โ”€โ”€โ”€โ”‚ ACTION โ”‚โ”€โ”€โ”€โ”€โ–บโ”‚ REWARD โ”‚ โ”‚ โ”‚ โ”‚ Q-TABLE โ”‚ โ”‚(Place โ”‚ โ”‚(Caught โ”‚ โ”‚ โ”‚ โ”‚(Remember)โ”‚ โ”‚sensors) โ”‚ โ”‚thieves?) โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ REPEAT! โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ """ ax1.text(0.5, 0.5, cycle_text, transform=ax1.transAxes, fontsize=10, verticalalignment='center', horizontalalignment='center', fontfamily='monospace', bbox=dict(boxstyle='round', facecolor='lightyellow')) ax1.set_title('How Q-Learning Works', fontsize=14) # Plot 2: Epsilon explanation ax2 = axes[1] episodes = np.arange(500) epsilon = 1.0 * (0.995 ** episodes) epsilon = np.maximum(epsilon, 0.01) ax2.fill_between(episodes, epsilon, alpha=0.3, color='blue', label='EXPLORE') ax2.fill_between(episodes, 0, 1-epsilon, alpha=0.3, color='green', label='EXPLOIT') ax2.plot(episodes, epsilon, 'b-', linewidth=2) ax2.plot(episodes, 1-epsilon, 'g-', linewidth=2) ax2.axvline(50, color='gray', linestyle='--', alpha=0.5) ax2.axvline(200, color='gray', linestyle='--', alpha=0.5) ax2.axvline(400, color='gray', linestyle='--', alpha=0.5) ax2.text(25, 0.5, 'Early:\n80% Explore', fontsize=9, ha='center') ax2.text(125, 0.5, 'Middle:\n50-50', fontsize=9, ha='center') ax2.text(300, 0.5, 'Late:\n80% Exploit', fontsize=9, ha='center') ax2.set_xlabel('Episode', fontsize=12) ax2.set_ylabel('Probability', fontsize=12) ax2.set_title('Explore vs Exploit Over Time', fontsize=14) ax2.legend(loc='center right') ax2.grid(True, alpha=0.3) plt.tight_layout() return fig def show_environment(hotspot1, hotspot2): """Visualize the thief world.""" fig, ax = plt.subplots(figsize=(12, 5)) world = ThiefWorld(hotspot1, hotspot2) zones = np.arange(10) probs = [world.get_thief_probability(z) for z in zones] colors = ['red' if p > 0.4 else 'orange' if p > 0.2 else 'green' for p in probs] bars = ax.bar(zones, probs, color=colors, alpha=0.7, edgecolor='black') for bar, prob in zip(bars, probs): ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, f'{prob*100:.0f}%', ha='center', fontsize=10, fontweight='bold') ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2, label=f'Hotspot 1 ({hotspot1})') ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2, label=f'Hotspot 2 ({hotspot2})') ax.set_xlabel('Zone', fontsize=12) ax.set_ylabel('Thief Probability', fontsize=12) ax.set_title('๐Ÿฆน Secret Thief Locations (AI Must Discover This!)', fontsize=14) ax.set_xticks(zones) ax.legend() ax.grid(True, alpha=0.3, axis='y') plt.tight_layout() return fig def simulate_one_episode(hotspot1, hotspot2): """Simulate and visualize one episode.""" np.random.seed(None) # Random seed for variety env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2) agent = QLearningAgent() agent.epsilon = 0.5 # 50% explore for demo state = env.reset() # Track daily data daily_actions = [] daily_caught = [] daily_thieves = [] for day in range(30): action = agent.choose_action(state) daily_actions.append(action) old_caught = env.total_caught old_thieves = env.total_thieves state, reward, done, info = env.step(action) daily_caught.append(env.total_caught - old_caught) daily_thieves.append(env.total_thieves - old_thieves) agent.learn(state, action, reward, state, done) # Create visualization fig, axes = plt.subplots(2, 2, figsize=(14, 10)) # Plot 1: Sensor placements over days ax1 = axes[0, 0] for day, action in enumerate(daily_actions): for zone in action: ax1.scatter(day, zone, c='blue', s=30, alpha=0.6) ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1') ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2') ax1.set_xlabel('Day', fontsize=12) ax1.set_ylabel('Zone', fontsize=12) ax1.set_title('๐Ÿ“ Where AI Placed Sensors Each Day', fontsize=14) ax1.legend() ax1.grid(True, alpha=0.3) ax1.set_yticks(range(10)) # Plot 2: Daily catches ax2 = axes[0, 1] days = range(1, 31) ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught') ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves') ax2.set_xlabel('Day', fontsize=12) ax2.set_ylabel('Count', fontsize=12) ax2.set_title('๐ŸŽฏ Daily Catches', fontsize=14) ax2.legend() ax2.grid(True, alpha=0.3) # Plot 3: Cumulative performance ax3 = axes[1, 0] cum_caught = np.cumsum(daily_caught) cum_thieves = np.cumsum(daily_thieves) ax3.fill_between(days, cum_caught, alpha=0.3, color='green') ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught') ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves') ax3.set_xlabel('Day', fontsize=12) ax3.set_ylabel('Cumulative Count', fontsize=12) ax3.set_title('๐Ÿ“ˆ Cumulative Performance', fontsize=14) ax3.legend() ax3.grid(True, alpha=0.3) # Plot 4: Zone usage ax4 = axes[1, 1] zone_usage = np.zeros(10) for action in daily_actions: for zone in action: zone_usage[zone] += 1 colors = ['blue' if z in [int(hotspot1), int(hotspot1)+1, int(hotspot2), int(hotspot2)+1] else 'gray' for z in range(10)] ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black') ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5) ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5) ax4.set_xlabel('Zone', fontsize=12) ax4.set_ylabel('Times Used', fontsize=12) ax4.set_title('๐Ÿ—บ๏ธ Zone Usage (Blue = Near Hotspots)', fontsize=14) ax4.set_xticks(range(10)) ax4.grid(True, alpha=0.3, axis='y') plt.tight_layout() # Summary catch_rate = env.total_caught / max(env.total_thieves, 1) * 100 summary = f""" ## ๐Ÿ“Š Episode Summary - **Total Thieves:** {env.total_thieves} - **Total Caught:** {env.total_caught} - **Catch Rate:** {catch_rate:.1f}% ### Zones Most Used: {', '.join([f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1]])} ### Note: This is just ONE episode with 50% exploration. Train for 500+ episodes to see real learning! """ return fig, summary # ============================================================================== # GRADIO INTERFACE # ============================================================================== with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # ๐Ÿค– Q-Learning AI for Sensor Placement **Watch an AI learn where to place sensors to catch thieves!** The AI starts knowing NOTHING and learns through trial-and-error. --- """) with gr.Tabs(): # ==== TAB 1: Explanation ==== with gr.TabItem("1๏ธโƒฃ What is Q-Learning?"): gr.Markdown(""" ## ๐ŸŽ“ Q-Learning Explained Simply ### Like Teaching a Dog: ``` 1. Dog tries something โ†’ 2. Gets treat (or not) โ†’ 3. Remembers โ†’ 4. Gets smarter! ``` ### For Our AI: ``` 1. AI places sensors โ†’ 2. Catches thieves (reward!) โ†’ 3. Updates Q-Table โ†’ 4. Gets smarter! ``` ### The Q-Table (AI's Memory): | State | Action | Expected Reward | |-------|--------|-----------------| | "Day 1" | Zones (1,3,6,8) | 1.5 points | | "Day 1" | Zones (2,3,7,8) | 3.2 points โ† Better! | ### Explore vs Exploit: - **EXPLORE**: Try random things to learn - **EXPLOIT**: Use what you already know Early training โ†’ More EXPLORE Late training โ†’ More EXPLOIT """) explain_btn = gr.Button("๐Ÿ“Š Show Visual Explanation", variant="primary") explain_plot = gr.Plot() explain_btn.click(explain_qlearning, outputs=explain_plot) # ==== TAB 2: Environment ==== with gr.TabItem("2๏ธโƒฃ The Secret World"): gr.Markdown(""" ## ๐Ÿฆน Where Do Thieves Appear? The AI doesn't know this! It must DISCOVER it through learning. Adjust the hotspot locations and see the thief distribution: """) with gr.Row(): h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location") h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location") env_btn = gr.Button("๐Ÿ—บ๏ธ Show Thief Distribution", variant="primary") env_plot = gr.Plot() env_btn.click(show_environment, [h1_slider, h2_slider], env_plot) # ==== TAB 3: One Episode ==== with gr.TabItem("3๏ธโƒฃ Watch One Episode"): gr.Markdown(""" ## ๐Ÿ‘€ See One Month (30 Days) of Simulation Watch how AI makes decisions and catches thieves. (Note: This is untrained AI with 50% exploration rate) """) with gr.Row(): h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1") h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2") ep_btn = gr.Button("โ–ถ๏ธ Run One Episode", variant="primary") ep_plot = gr.Plot() ep_summary = gr.Markdown() ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary]) # ==== TAB 4: Full Training ==== with gr.TabItem("4๏ธโƒฃ Train the AI!"): gr.Markdown(""" ## ๐Ÿ‹๏ธ Train Q-Learning AI Train the AI and compare it against other strategies! โš ๏ธ Training takes a few seconds depending on episodes. """) with gr.Row(): episodes_slider = gr.Slider(100, 1000, value=300, step=50, label="Number of Episodes") with gr.Row(): h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1") h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2") train_btn = gr.Button("๐Ÿš€ Train AI!", variant="primary", size="lg") train_plot = gr.Plot() train_results = gr.Markdown() train_btn.click(train_and_test, [episodes_slider, h1_train, h2_train], [train_plot, train_results]) # ==== TAB 5: Summary ==== with gr.TabItem("5๏ธโƒฃ Key Concepts"): gr.Markdown(""" ## ๐Ÿ“š Summary: Q-Learning Key Concepts ### 1. Q-Table ``` A "cheat sheet" that stores: "In STATE X, if I do ACTION Y, I expect REWARD Z" ``` ### 2. State ``` What the AI "sees" at any moment. Example: (most_tried_zone, best_zone_so_far) ``` ### 3. Action ``` What the AI can do. Example: Place sensors in zones (2, 3, 7, 8) ``` ### 4. Reward ``` Points for good actions. Example: +1 for each thief caught ``` ### 5. Epsilon (ฮต) ``` Exploration rate. ฮต = 1.0 โ†’ 100% random (exploring) ฮต = 0.01 โ†’ 1% random (exploiting knowledge) ``` ### 6. Learning Formula ``` Q(s,a) = Q(s,a) + ฮฑ ร— (reward + ฮณ ร— max(Q(s',a')) - Q(s,a)) In simple terms: New Memory = Old Memory + Learning Rate ร— (Reality - Expectation) ``` --- ## ๐ŸŽฏ Why This Matters This same technique is used in: - ๐ŸŽฎ Game AI (AlphaGo, Chess engines) - ๐Ÿš— Self-driving cars - ๐Ÿค– Robots - ๐Ÿ“ฑ Recommendation systems **You just learned how real AI works!** ๐ŸŽ“ """) gr.Markdown(""" --- ### ๐Ÿ”— About This demo shows **Q-Learning Reinforcement Learning** for sensor placement. The AI learns through trial-and-error, just like humans! """) # Launch if __name__ == "__main__": demo.launch()