Spaces:
Sleeping
Sleeping
| """ | |
| Q-Learning AI for Sensor Placement - Interactive Demo | |
| For Hugging Face Spaces | |
| """ | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import gradio as gr | |
| from collections import defaultdict | |
| np.random.seed(42) | |
| # ============================================================================== | |
| # PART 1: THE SECRET WORLD | |
| # ============================================================================== | |
class ThiefWorld:
    """Hidden ground truth describing where thieves tend to appear.

    The world is a row of ten zones. Thief likelihood is a baseline plus two
    bell-shaped bumps centred on ``hotspot1`` and ``hotspot2``.
    """

    def __init__(self, hotspot1=2.5, hotspot2=7.0):
        self.n_zones = 10
        self.hotspot1, self.hotspot2 = hotspot1, hotspot2

    def get_thief_probability(self, zone):
        """Return the probability (capped at 1.0) of a thief in ``zone``."""
        # The probability is evaluated at the middle of the zone.
        center = zone + 0.5
        first_peak = 0.6 * np.exp(-((center - self.hotspot1)**2) / 1.0)
        second_peak = 0.4 * np.exp(-((center - self.hotspot2)**2) / 0.8)
        baseline = 0.05
        return min(first_peak + second_peak + baseline, 1.0)

    def generate_thieves(self):
        """Sample one day: a float array with 1 where a thief is present."""
        # One uniform draw per zone, taken in zone order.
        present = [
            np.random.random() < self.get_thief_probability(idx)
            for idx in range(self.n_zones)
        ]
        return np.array(present, dtype=float)
| # ============================================================================== | |
| # PART 2: SENSOR | |
| # ============================================================================== | |
class Sensor:
    """A detector that catches a thief who is present with a fixed probability."""

    def __init__(self, catch_probability=0.9):
        self.catch_prob = catch_probability

    def try_catch(self, thief_present):
        """Return whether the sensor catches a thief; no draw is made if none is present."""
        if not thief_present:
            return False
        return np.random.random() < self.catch_prob
| # ============================================================================== | |
| # PART 3: ENVIRONMENT | |
| # ============================================================================== | |
class SensorPlacementEnv:
    """Episodic environment: place sensors each day and count thieves caught.

    Args:
        n_sensors: Number of sensors available per day (stored for callers).
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.
        episode_length: Number of days after which ``step`` reports ``done``.
    """

    def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0, episode_length=30):
        self.world = ThiefWorld(hotspot1, hotspot2)
        self.sensor = Sensor()
        self.n_sensors = n_sensors
        # Take the zone count from the world so the two can never disagree.
        self.n_zones = self.world.n_zones
        self.episode_length = episode_length
        self.reset()

    def reset(self):
        """Clear all per-episode statistics and return the initial state."""
        self.zone_attempts = np.zeros(self.n_zones)
        self.zone_catches = np.zeros(self.n_zones)
        self.day = 0
        self.total_caught = 0
        self.total_thieves = 0
        return self._get_state()

    def _get_state(self):
        """Return ``(most_tried_zone, best_catch_rate_zone)``; ``(0, 0)`` before any attempt."""
        if self.zone_attempts.sum() == 0:
            return (0, 0)
        # Zones that were never tried keep a catch rate of 0.
        catch_rates = np.divide(
            self.zone_catches,
            self.zone_attempts,
            out=np.zeros(self.n_zones),
            where=self.zone_attempts > 0,
        )
        return (int(np.argmax(self.zone_attempts)), int(np.argmax(catch_rates)))

    def step(self, action):
        """Simulate one day with sensors placed in the zones listed in ``action``.

        Zones outside ``0 .. n_zones - 1`` are ignored. A thief can be caught
        at most once per day, even if ``action`` names the same zone twice.

        Returns:
            ``(next_state, reward, done, info)`` where ``info['caught']`` is the
            number of thieves caught today.
        """
        thieves = self.world.generate_thieves()
        self.total_thieves += int(thieves.sum())

        caught = 0
        caught_zones = set()
        for zone in action:
            # Reject negatives too: they would silently index from the end.
            if not 0 <= zone < self.n_zones:
                continue
            self.zone_attempts[zone] += 1
            if thieves[zone] != 1 or zone in caught_zones:
                continue
            if self.sensor.try_catch(True):
                caught_zones.add(zone)
                caught += 1
                self.zone_catches[zone] += 1

        self.total_caught += caught
        self.day += 1
        # Small bonus per distinct zone named, on top of the catches.
        reward = caught + 0.1 * len(set(action))
        done = self.day >= self.episode_length
        return self._get_state(), reward, done, {'caught': caught}
| # ============================================================================== | |
| # PART 4: Q-LEARNING AGENT | |
| # ============================================================================== | |
class QLearningAgent:
    """Tabular epsilon-greedy Q-learner that picks among fixed sensor layouts."""

    def __init__(self):
        # Update-rule hyper-parameters.
        self.learning_rate = 0.1
        self.discount_factor = 0.95
        # Exploration schedule: start fully random, decay towards the floor.
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        # state -> action -> value; unseen entries read as 0.0.
        self.q_table = defaultdict(lambda: defaultdict(float))

    def _get_possible_actions(self):
        """Return the fifteen candidate four-zone layouts, in a fixed order."""
        layouts = []
        layouts += [(1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8)]
        layouts += [(0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5)]
        layouts += [(5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7)]
        layouts += [(2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7)]
        layouts += [(3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9)]
        return layouts

    def choose_action(self, state):
        """Pick a layout: random with probability epsilon, else the highest-valued one."""
        candidates = self._get_possible_actions()
        if np.random.random() < self.epsilon:
            return candidates[np.random.randint(len(candidates))]

        chosen, top_score = None, -999999
        for candidate in candidates:
            score = self.q_table[state][candidate]
            # Strict comparison: the earliest layout wins ties.
            if score > top_score:
                chosen, top_score = candidate, score
        if chosen is None:
            chosen = candidates[np.random.randint(len(candidates))]
        return chosen

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition."""
        current = self.q_table[state][action]
        if done:
            # Terminal transition: nothing to bootstrap from.
            best_next = 0
        else:
            best_next = max(
                [self.q_table[next_state][a] for a in self._get_possible_actions()]
            )
        td_target = reward + self.discount_factor * best_next
        self.q_table[state][action] = current + self.learning_rate * (td_target - current)

    def decay_epsilon(self):
        """Shrink epsilon by one decay step, never going below ``epsilon_min``."""
        decayed = self.epsilon * self.epsilon_decay
        self.epsilon = max(self.epsilon_min, decayed)
| # ============================================================================== | |
| # TRAINING AND TESTING FUNCTIONS | |
| # ============================================================================== | |
def _mean_catch_rate(env, pick_action, n_tests):
    """Return the mean catch rate (%) of ``pick_action(state)`` over ``n_tests`` episodes."""
    rates = []
    for _ in range(n_tests):
        state = env.reset()
        done = False
        while not done:
            state, _, done, _ = env.step(pick_action(state))
        rates.append(env.total_caught / max(env.total_thieves, 1) * 100)
    return np.mean(rates)


def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()):
    """Train a Q-learning agent, benchmark it against baselines and plot the outcome.

    Args:
        n_episodes: Number of training episodes (coerced to an int of at least 1).
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.
        progress: Gradio progress tracker used to wrap the training loop.

    Returns:
        ``(figure, markdown_text)``: a 2x2 matplotlib figure and a results summary.
    """
    # UI sliders may deliver floats; range() needs an int.
    n_episodes = max(1, int(n_episodes))
    # Fixed seed so repeated runs with the same settings give the same plots.
    np.random.seed(42)

    # ---- Training ----
    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()
    episode_catch_rates = []
    epsilon_history = []
    for _ in progress.tqdm(range(n_episodes), desc="Training AI"):
        state = env.reset()
        done = False
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            state = next_state
        agent.decay_epsilon()
        episode_catch_rates.append(env.total_caught / max(env.total_thieves, 1) * 100)
        epsilon_history.append(agent.epsilon)

    # ---- Testing ----
    n_tests = 50
    n_zones = env.n_zones
    truth = np.array([env.world.get_thief_probability(z) for z in range(n_zones)])
    # The "cheating" baseline covers the zones that truly have the highest thief
    # probability. This always yields distinct, in-range zones, which
    # int(hotspot) / int(hotspot) + 1 did not.
    perfect_action = tuple(
        sorted(int(z) for z in np.argsort(-truth, kind='stable')[:env.n_sensors])
    )

    agent.epsilon = 0  # evaluate the greedy policy only
    results = {}
    results['Q-Learning AI'] = _mean_catch_rate(env, agent.choose_action, n_tests)
    results['Random'] = _mean_catch_rate(
        env,
        lambda _state: tuple(np.random.choice(n_zones, env.n_sensors, replace=False)),
        n_tests,
    )
    results['Static Uniform'] = _mean_catch_rate(env, lambda _state: (1, 3, 6, 8), n_tests)
    results['Perfect (Cheating)'] = _mean_catch_rate(
        env, lambda _state: perfect_action, n_tests
    )

    # ---- Plots ----
    # Plot titles stay plain text: matplotlib's default font has no emoji glyphs.
    fig = plt.figure(figsize=(16, 12))

    # Plot 1: learning curve
    ax1 = fig.add_subplot(2, 2, 1)
    window = max(10, n_episodes // 20)
    if len(episode_catch_rates) >= window:
        smoothed = np.convolve(episode_catch_rates, np.ones(window) / window, mode='valid')
        ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw')
        ax1.plot(range(window - 1, len(episode_catch_rates)), smoothed,
                 color='green', linewidth=2, label='Smoothed')
    else:
        # Labelled so that legend() below always has an entry.
        ax1.plot(episode_catch_rates, color='green', linewidth=2, label='Raw')
    ax1.set_xlabel('Episode', fontsize=12)
    ax1.set_ylabel('Catch Rate (%)', fontsize=12)
    ax1.set_title('AI Learning Progress', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: epsilon decay
    ax2 = fig.add_subplot(2, 2, 2)
    ax2.plot(epsilon_history, color='purple', linewidth=2)
    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12)
    ax2.set_title('Explore vs Exploit Balance', fontsize=14)
    ax2.grid(True, alpha=0.3)
    ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10,
                 xytext=(n_episodes * 0.1, 0.8),
                 arrowprops=dict(arrowstyle='->', color='gray'))
    ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes - 1, epsilon_history[-1]),
                 fontsize=10, xytext=(n_episodes * 0.7, 0.3),
                 arrowprops=dict(arrowstyle='->', color='gray'))

    # Plot 3: average Q-value of the layouts containing each zone vs. the truth
    ax3 = fig.add_subplot(2, 2, 3)
    zone_values = np.zeros(n_zones)
    zone_counts = np.zeros(n_zones)
    for action_values in agent.q_table.values():
        for action, value in action_values.items():
            for zone in action:
                zone_values[zone] += value
                zone_counts[zone] += 1
    zone_counts[zone_counts == 0] = 1  # avoid division by zero for unseen zones
    learned = zone_values / zone_counts
    x = np.arange(n_zones)
    width = 0.35
    ax3.bar(x - width / 2, learned / max(learned.max(), 0.01), width,
            label='AI Learned', color='blue', alpha=0.7)
    ax3.bar(x + width / 2, truth / truth.max(), width,
            label='True Probability', color='red', alpha=0.7)
    ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5,
                label=f'Hotspot 1 ({hotspot1})')
    ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5,
                label=f'Hotspot 2 ({hotspot2})')
    ax3.set_xlabel('Zone', fontsize=12)
    ax3.set_ylabel('Normalized Value', fontsize=12)
    ax3.set_title('Did AI Learn the Truth?', fontsize=14)
    ax3.legend(loc='upper right')
    ax3.grid(True, alpha=0.3)
    ax3.set_xticks(range(n_zones))

    # Plot 4: final comparison
    ax4 = fig.add_subplot(2, 2, 4)
    names = list(results.keys())
    values = list(results.values())
    colors = ['green', 'gray', 'orange', 'blue']
    bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black')
    for bar, val in zip(bars, values):
        ax4.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                 f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')
    ax4.set_ylabel('Catch Rate (%)', fontsize=12)
    ax4.set_title('Final Comparison', fontsize=14)
    ax4.grid(True, alpha=0.3, axis='y')
    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right')

    # Lay out this figure explicitly instead of pyplot's global "current" figure.
    fig.tight_layout()
    # Drop pyplot's reference so figures do not pile up across requests; the
    # Figure object itself stays usable by the caller.
    plt.close(fig)

    # Report what was actually learned rather than echoing the inputs.
    top_learned = sorted(int(z) for z in np.argsort(-learned, kind='stable')[:2])

    results_text = f"""
## Training Complete!

### Training Summary:
- Episodes trained: **{n_episodes}**
- Hotspot 1: Zone **{hotspot1}**
- Hotspot 2: Zone **{hotspot2}**
- Final exploration rate: **{epsilon_history[-1]*100:.1f}%**

### Test Results ({n_tests} test runs each):

| Strategy | Catch Rate |
|----------|------------|
| **Q-Learning AI** | **{results['Q-Learning AI']:.1f}%** |
| Random | {results['Random']:.1f}% |
| Static Uniform | {results['Static Uniform']:.1f}% |
| Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% |

### What AI Learned:
The AI values zones **{top_learned[0]}** and **{top_learned[1]}** most (true hotspots are at **{hotspot1}** and **{hotspot2}**).

### Key Insight:
AI started knowing **NOTHING** and learned through **trial and error**!
"""
    return fig, results_text
def _qlearning_cycle_diagram(width=61):
    """Build the boxed STATE -> ACTION -> REWARD -> UPDATE diagram as monospace text."""
    def row(content=''):
        # Every row is padded to the same inner width so the frame lines up.
        return '│' + content.center(width) + '│'

    gap = ' ' * 10
    rows = [
        '┌' + '─' * width + '┐',
        row('Q-LEARNING CYCLE'),
        row(),
        row('┌─────────┐'),
        row('│  STATE  │'),
        row('│(What AI │'),
        row('│  sees)  │'),
        row('└────┬────┘'),
        row('│'),
        row('▼'),
        row('┌──────────┐' + gap + '┌─────────┐' + gap + '┌──────────┐'),
        row('│  UPDATE  │◄─────────│ ACTION  │─────────►│  REWARD  │'),
        row('│ Q-TABLE  │' + gap + '│(Place   │' + gap + '│(Caught   │'),
        row('│(Remember)│' + gap + '│sensors) │' + gap + '│thieves?) │'),
        row('└─────┬────┘' + gap + '└─────────┘' + gap + '└────┬─────┘'),
        row('│' + ' ' * 41 + '│'),
        row('└' + '─' * 41 + '┘'),
        row('REPEAT!'),
        '└' + '─' * width + '┘',
    ]
    return '\n' + '\n'.join(rows) + '\n'


def explain_qlearning():
    """Return a two-panel figure: the Q-learning cycle and the explore/exploit schedule."""
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Panel 1: the learning cycle as a text diagram
    ax1 = axes[0]
    ax1.axis('off')
    ax1.text(0.5, 0.5, _qlearning_cycle_diagram(), transform=ax1.transAxes, fontsize=10,
             verticalalignment='center', horizontalalignment='center',
             fontfamily='monospace', bbox=dict(boxstyle='round', facecolor='lightyellow'))
    ax1.set_title('How Q-Learning Works', fontsize=14)

    # Panel 2: epsilon decays by 0.995 per episode from 1.0 down to a 0.01 floor
    ax2 = axes[1]
    episodes = np.arange(500)
    epsilon = np.maximum(1.0 * (0.995 ** episodes), 0.01)
    ax2.fill_between(episodes, epsilon, alpha=0.3, color='blue', label='EXPLORE')
    ax2.fill_between(episodes, 0, 1 - epsilon, alpha=0.3, color='green', label='EXPLOIT')
    ax2.plot(episodes, epsilon, 'b-', linewidth=2)
    ax2.plot(episodes, 1 - epsilon, 'g-', linewidth=2)
    for boundary in (50, 200, 400):
        ax2.axvline(boundary, color='gray', linestyle='--', alpha=0.5)
    ax2.text(25, 0.5, 'Early:\n80% Explore', fontsize=9, ha='center')
    ax2.text(125, 0.5, 'Middle:\n50-50', fontsize=9, ha='center')
    ax2.text(300, 0.5, 'Late:\n80% Exploit', fontsize=9, ha='center')
    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Probability', fontsize=12)
    ax2.set_title('Explore vs Exploit Over Time', fontsize=14)
    ax2.legend(loc='center right')
    ax2.grid(True, alpha=0.3)

    fig.tight_layout()
    # Drop pyplot's reference so figures do not accumulate across button clicks.
    plt.close(fig)
    return fig
def show_environment(hotspot1, hotspot2):
    """Return a bar chart of the true per-zone thief probability.

    Args:
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    world = ThiefWorld(hotspot1, hotspot2)
    zones = np.arange(world.n_zones)
    probs = [world.get_thief_probability(z) for z in zones]

    # Traffic-light colouring: red above 40%, orange above 20%, green otherwise.
    colors = ['red' if p > 0.4 else 'orange' if p > 0.2 else 'green' for p in probs]
    bars = ax.bar(zones, probs, color=colors, alpha=0.7, edgecolor='black')
    for bar, prob in zip(bars, probs):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                f'{prob*100:.0f}%', ha='center', fontsize=10, fontweight='bold')

    ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2,
               label=f'Hotspot 1 ({hotspot1})')
    ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2,
               label=f'Hotspot 2 ({hotspot2})')
    ax.set_xlabel('Zone', fontsize=12)
    ax.set_ylabel('Thief Probability', fontsize=12)
    # Plain-text title: matplotlib's default font has no emoji glyphs.
    ax.set_title('Secret Thief Locations (AI Must Discover This!)', fontsize=14)
    ax.set_xticks(zones)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    fig.tight_layout()
    # Drop pyplot's reference so figures do not accumulate across button clicks.
    plt.close(fig)
    return fig
def simulate_one_episode(hotspot1, hotspot2):
    """Run one episode with a fresh agent at 50% exploration and visualise it.

    Args:
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.

    Returns:
        ``(figure, markdown_text)``: a 2x2 matplotlib figure and an episode summary.
    """
    np.random.seed(None)  # reseed from entropy so every click differs
    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()
    agent.epsilon = 0.5  # 50% explore for the demo
    state = env.reset()

    daily_actions = []
    daily_caught = []
    daily_thieves = []
    for _ in range(30):
        action = agent.choose_action(state)
        thieves_before = env.total_thieves
        next_state, reward, done, info = env.step(action)
        # Learn from the real transition: the state before the step and the
        # state after it must be kept apart.
        agent.learn(state, action, reward, next_state, done)
        state = next_state
        daily_actions.append(action)
        daily_caught.append(info['caught'])
        daily_thieves.append(env.total_thieves - thieves_before)
        if done:
            break

    n_days = len(daily_actions)
    days = range(1, n_days + 1)

    # Plot titles stay plain text: matplotlib's default font has no emoji glyphs.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Plot 1: sensor placements over days (one scatter call for all points)
    ax1 = axes[0, 0]
    placement_days = [day for day, action in enumerate(daily_actions) for _ in action]
    placement_zones = [zone for action in daily_actions for zone in action]
    ax1.scatter(placement_days, placement_zones, c='blue', s=30, alpha=0.6)
    ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label='Hotspot 1')
    ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label='Hotspot 2')
    ax1.set_xlabel('Day', fontsize=12)
    ax1.set_ylabel('Zone', fontsize=12)
    ax1.set_title('Where AI Placed Sensors Each Day', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_yticks(range(10))

    # Plot 2: daily catches
    ax2 = axes[0, 1]
    ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught')
    ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves')
    ax2.set_xlabel('Day', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)
    ax2.set_title('Daily Catches', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Plot 3: cumulative performance
    ax3 = axes[1, 0]
    cum_caught = np.cumsum(daily_caught)
    cum_thieves = np.cumsum(daily_thieves)
    ax3.fill_between(days, cum_caught, alpha=0.3, color='green')
    ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught')
    ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves')
    ax3.set_xlabel('Day', fontsize=12)
    ax3.set_ylabel('Cumulative Count', fontsize=12)
    ax3.set_title('Cumulative Performance', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: zone usage
    ax4 = axes[1, 1]
    zone_usage = np.zeros(10)
    for zone in placement_zones:
        zone_usage[zone] += 1
    near_hotspots = {int(hotspot1), int(hotspot1) + 1, int(hotspot2), int(hotspot2) + 1}
    colors = ['blue' if z in near_hotspots else 'gray' for z in range(10)]
    ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black')
    ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5)
    ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5)
    ax4.set_xlabel('Zone', fontsize=12)
    ax4.set_ylabel('Times Used', fontsize=12)
    ax4.set_title('Zone Usage (Blue = Near Hotspots)', fontsize=14)
    ax4.set_xticks(range(10))
    ax4.grid(True, alpha=0.3, axis='y')

    fig.tight_layout()
    # Drop pyplot's reference so figures do not accumulate across button clicks.
    plt.close(fig)

    # Summary
    catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
    most_used = ', '.join([f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1]])
    summary = f"""
## Episode Summary
- **Total Thieves:** {env.total_thieves}
- **Total Caught:** {env.total_caught}
- **Catch Rate:** {catch_rate:.1f}%

### Zones Most Used:
{most_used}

### Note:
This is just ONE episode with 50% exploration.
Train for 500+ episodes to see real learning!
"""
    return fig, summary
| # ============================================================================== | |
| # GRADIO INTERFACE | |
| # ============================================================================== | |
# Five-tab Gradio UI. Each button is wired to one of the functions above.
# Indented markdown relies on Gradio un-indenting Markdown values
# (NOTE(review): confirm for the pinned Gradio version).
with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🤖 Q-Learning AI for Sensor Placement

    **Watch an AI learn where to place sensors to catch thieves!**

    The AI starts knowing NOTHING and learns through trial-and-error.

    ---
    """)
    with gr.Tabs():
        # ==== TAB 1: Explanation ====
        with gr.TabItem("1️⃣ What is Q-Learning?"):
            gr.Markdown("""
            ## Q-Learning Explained Simply

            ### Like Teaching a Dog:
            ```
            1. Dog tries something → 2. Gets treat (or not) → 3. Remembers → 4. Gets smarter!
            ```

            ### For Our AI:
            ```
            1. AI places sensors → 2. Catches thieves (reward!) → 3. Updates Q-Table → 4. Gets smarter!
            ```

            ### The Q-Table (AI's Memory):

            | State | Action | Expected Reward |
            |-------|--------|-----------------|
            | "Day 1" | Zones (1,3,6,8) | 1.5 points |
            | "Day 1" | Zones (2,3,7,8) | 3.2 points ← Better! |

            ### Explore vs Exploit:
            - **EXPLORE**: Try random things to learn
            - **EXPLOIT**: Use what you already know

            Early training → More EXPLORE

            Late training → More EXPLOIT
            """)
            explain_btn = gr.Button("Show Visual Explanation", variant="primary")
            explain_plot = gr.Plot()
            explain_btn.click(explain_qlearning, outputs=explain_plot)

        # ==== TAB 2: Environment ====
        with gr.TabItem("2️⃣ The Secret World"):
            gr.Markdown("""
            ## 🦹 Where Do Thieves Appear?

            The AI doesn't know this! It must DISCOVER it through learning.

            Adjust the hotspot locations and see the thief distribution:
            """)
            with gr.Row():
                h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location")
                h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location")
            env_btn = gr.Button("🗺️ Show Thief Distribution", variant="primary")
            env_plot = gr.Plot()
            env_btn.click(show_environment, [h1_slider, h2_slider], env_plot)

        # ==== TAB 3: One Episode ====
        with gr.TabItem("3️⃣ Watch One Episode"):
            gr.Markdown("""
            ## See One Month (30 Days) of Simulation

            Watch how AI makes decisions and catches thieves.

            (Note: This is untrained AI with 50% exploration rate)
            """)
            with gr.Row():
                h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            ep_btn = gr.Button("▶️ Run One Episode", variant="primary")
            ep_plot = gr.Plot()
            ep_summary = gr.Markdown()
            ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary])

        # ==== TAB 4: Full Training ====
        with gr.TabItem("4️⃣ Train the AI!"):
            gr.Markdown("""
            ## Train Q-Learning AI

            Train the AI and compare it against other strategies!

            ⚠️ Training takes a few seconds depending on episodes.
            """)
            with gr.Row():
                episodes_slider = gr.Slider(100, 1000, value=300, step=50,
                                            label="Number of Episodes")
            with gr.Row():
                h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            train_btn = gr.Button("Train AI!", variant="primary", size="lg")
            train_plot = gr.Plot()
            train_results = gr.Markdown()
            train_btn.click(train_and_test,
                            [episodes_slider, h1_train, h2_train],
                            [train_plot, train_results])

        # ==== TAB 5: Summary ====
        with gr.TabItem("5️⃣ Key Concepts"):
            gr.Markdown("""
            ## Summary: Q-Learning Key Concepts

            ### 1. Q-Table
            ```
            A "cheat sheet" that stores:
            "In STATE X, if I do ACTION Y, I expect REWARD Z"
            ```

            ### 2. State
            ```
            What the AI "sees" at any moment.
            Example: (most_tried_zone, best_zone_so_far)
            ```

            ### 3. Action
            ```
            What the AI can do.
            Example: Place sensors in zones (2, 3, 7, 8)
            ```

            ### 4. Reward
            ```
            Points for good actions.
            Example: +1 for each thief caught
            ```

            ### 5. Epsilon (ε)
            ```
            Exploration rate.
            ε = 1.0 → 100% random (exploring)
            ε = 0.01 → 1% random (exploiting knowledge)
            ```

            ### 6. Learning Formula
            ```
            Q(s,a) = Q(s,a) + α × (reward + γ × max(Q(s',a')) - Q(s,a))

            In simple terms:
            New Memory = Old Memory + Learning Rate × (Reality - Expectation)
            ```

            ---

            ## Why This Matters

            This same technique is used in:
            - Game AI (AlphaGo, Chess engines)
            - Self-driving cars
            - Robots
            - Recommendation systems

            **You just learned how real AI works!**
            """)
    gr.Markdown("""
    ---
    ### About
    This demo shows **Q-Learning Reinforcement Learning** for sensor placement.

    The AI learns through trial-and-error, just like humans!
    """)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()