"""
Q-Learning AI for Sensor Placement - Interactive Demo
For Hugging Face Spaces
"""

import numpy as np
import matplotlib.pyplot as plt
import gradio as gr
from collections import defaultdict

# Seed NumPy's global RNG once at import time so the module starts from a
# fixed random stream.
np.random.seed(42)


# ==============================================================================
# PART 1: THE SECRET WORLD
# ==============================================================================

class ThiefWorld:
    """Ground-truth thief distribution over the zones (hidden from the agent).

    A zone's daily thief probability is the sum of two bell-shaped bumps
    centred on ``hotspot1`` and ``hotspot2`` plus a flat 5% baseline.
    """

    def __init__(self, hotspot1=2.5, hotspot2=7.0):
        self.n_zones = 10
        self.hotspot1, self.hotspot2 = hotspot1, hotspot2

    def get_thief_probability(self, zone):
        """Return the probability, capped at 1.0, of a thief in ``zone``."""
        centre = zone + 0.5  # the bumps are evaluated at the zone's midpoint
        near_first = 0.6 * np.exp(-((centre - self.hotspot1) ** 2) / 1.0)
        near_second = 0.4 * np.exp(-((centre - self.hotspot2) ** 2) / 0.8)
        baseline = 0.05
        total = near_first + near_second + baseline
        return 1.0 if total > 1.0 else total

    def generate_thieves(self):
        """Sample one day; return a float array holding 1 where a thief appeared."""
        # Exactly one random draw per zone, in zone order.
        appeared = [
            np.random.random() < self.get_thief_probability(zone_id)
            for zone_id in range(self.n_zones)
        ]
        return np.array(appeared, dtype=float)


# ==============================================================================
# PART 2: SENSOR
# ==============================================================================

class Sensor:
    """A detector that catches a present thief with a fixed probability."""

    def __init__(self, catch_probability=0.9):
        self.catch_prob = catch_probability

    def try_catch(self, thief_present):
        """Return whether this attempt catches a thief.

        When ``thief_present`` is falsy the result is ``False`` and no random
        number is drawn.
        """
        if not thief_present:
            return False
        roll = np.random.random()
        return roll < self.catch_prob


# ==============================================================================
# PART 3: ENVIRONMENT
# ==============================================================================

class SensorPlacementEnv:
    """Episodic sensor-placement environment over a hidden ``ThiefWorld``.

    Each call to :meth:`step` simulates one day: thieves are sampled from the
    world, the chosen zones are monitored, and per-zone attempt/catch counts
    are updated. The observable state is the pair
    ``(most_tried_zone, zone_with_best_catch_rate)``.
    """

    # Number of days after which ``step`` reports ``done``.
    EPISODE_DAYS = 30

    def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0):
        self.world = ThiefWorld(hotspot1, hotspot2)
        self.sensor = Sensor()
        self.n_sensors = n_sensors
        # Take the zone count from the world so the two can never disagree.
        self.n_zones = self.world.n_zones
        self.reset()

    def reset(self):
        """Clear all counters and return the initial state ``(0, 0)``."""
        self.zone_attempts = np.zeros(self.n_zones)
        self.zone_catches = np.zeros(self.n_zones)
        self.day = 0
        self.total_caught = 0
        self.total_thieves = 0
        return self._get_state()

    def _get_state(self):
        """Return ``(most_tried_zone, best_catch_rate_zone)``; ``(0, 0)`` before any attempt."""
        if self.zone_attempts.sum() == 0:
            return (0, 0)
        most_tried = int(np.argmax(self.zone_attempts))
        catch_rates = np.zeros(self.n_zones)
        for z in range(self.n_zones):
            # Zones never monitored keep a catch rate of 0.
            if self.zone_attempts[z] > 0:
                catch_rates[z] = self.zone_catches[z] / self.zone_attempts[z]
        best_zone = int(np.argmax(catch_rates))
        return (most_tried, best_zone)

    def step(self, action):
        """Simulate one day with sensors on the zones listed in ``action``.

        Args:
            action: Iterable of zone indices to monitor today.

        Returns:
            ``(next_state, reward, done, info)`` where ``reward`` is the number
            of thieves caught plus ``0.1`` per distinct entry in ``action``,
            ``done`` becomes true once ``EPISODE_DAYS`` days have elapsed, and
            ``info`` is ``{'caught': <thieves caught today>}``.
        """
        thieves = self.world.generate_thieves()
        self.total_thieves += int(thieves.sum())

        # A zone holds at most one thief per day, so a repeated zone must not
        # get a second chance at the same thief; otherwise catches could
        # exceed the number of thieves. Out-of-range ids are ignored, including
        # negative ones, which would otherwise wrap to the end of the arrays.
        monitored = [
            zone for zone in dict.fromkeys(action)
            if 0 <= zone < self.n_zones
        ]

        caught = 0
        for zone in monitored:
            self.zone_attempts[zone] += 1
            if thieves[zone] == 1 and self.sensor.try_catch(True):
                caught += 1
                self.zone_catches[zone] += 1

        self.total_caught += caught
        self.day += 1
        # Small bonus per distinct zone in the action (unchanged reward shape).
        reward = caught + 0.1 * len(set(action))
        done = self.day >= self.EPISODE_DAYS

        return self._get_state(), reward, done, {'caught': caught}


# ==============================================================================
# PART 4: Q-LEARNING AGENT
# ==============================================================================

class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent over a fixed menu of placements.

    ``q_table[state][action]`` holds the current value estimate; entries that
    have never been written read as ``0.0``.
    """

    # Candidate 4-zone placements the agent chooses between. Kept as a class
    # constant so the menu is not rebuilt on every decision.
    ACTIONS = (
        (1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8),
        (0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5),
        (5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7),
        (2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7),
        (3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9),
    )

    def __init__(self):
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.learning_rate = 0.1     # alpha: step size of each Q update
        self.discount_factor = 0.95  # gamma: weight of the next state's best value
        self.epsilon = 1.0           # probability of taking a random action
        self.epsilon_decay = 0.995   # multiplicative decay applied by decay_epsilon()
        self.epsilon_min = 0.01      # floor for epsilon

    def _get_possible_actions(self):
        """Return the available actions as a fresh list."""
        return list(self.ACTIONS)

    def choose_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value for ``state`` is
        returned, the earliest one in the menu winning ties.
        """
        actions = self._get_possible_actions()
        if np.random.random() < self.epsilon:
            return actions[np.random.randint(len(actions))]
        q_row = self.q_table[state]
        # max() returns the first maximal element, which matches a strict
        # ">" scan without needing a sentinel value or a fallback branch.
        return max(actions, key=lambda action: q_row[action])

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the transition just observed.

        ``Q(s, a) += alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))``,
        where the bootstrap term is dropped when ``done`` is true.
        """
        old_q = self.q_table[state][action]
        if done:
            max_future_q = 0
        else:
            next_row = self.q_table[next_state]
            max_future_q = max(next_row[a] for a in self._get_possible_actions())
        target = reward + self.discount_factor * max_future_q
        self.q_table[state][action] = old_q + self.learning_rate * (target - old_q)

    def decay_epsilon(self):
        """Shrink ``epsilon`` by ``epsilon_decay``, never below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


# ==============================================================================
# TRAINING AND TESTING FUNCTIONS
# ==============================================================================

def _evaluate_strategy(env, choose_action, n_runs, max_days=30):
    """Return the mean catch rate (%) of one strategy over ``n_runs`` episodes.

    Args:
        env: Environment to evaluate on; it is reset before every run.
        choose_action: Callable mapping the current state to that day's action.
        n_runs: Number of episodes to average over.
        max_days: Upper bound on steps per episode should ``done`` never be set.
    """
    catch_rates = []
    for _ in range(n_runs):
        state = env.reset()
        for _day in range(max_days):
            state, _reward, done, _info = env.step(choose_action(state))
            if done:
                break
        catch_rates.append(env.total_caught / max(env.total_thieves, 1) * 100)
    return np.mean(catch_rates)


def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()):
    """Train a Q-learning agent, benchmark it, and build the results figure.

    Args:
        n_episodes: Number of 30-day training episodes (must be >= 1).
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.
        progress: Gradio progress tracker used to wrap the training loop.

    Returns:
        ``(fig, results_text)``: a 2x2 matplotlib figure (learning curve,
        epsilon decay, learned-vs-true zone values, strategy comparison) and a
        Markdown summary of the run.

    Raises:
        ValueError: If ``n_episodes`` is smaller than 1.
    """

    # Coerce defensively in case the caller hands over a float such as 300.0;
    # range() below requires an int.
    n_episodes = int(n_episodes)
    if n_episodes < 1:
        raise ValueError("n_episodes must be at least 1")

    np.random.seed(42)

    # Training
    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()

    episode_rewards = []
    episode_catch_rates = []
    epsilon_history = []

    for episode in progress.tqdm(range(n_episodes), desc="Training AI"):
        state = env.reset()
        total_reward = 0

        for day in range(30):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                break

        agent.decay_epsilon()
        episode_rewards.append(total_reward)
        catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
        episode_catch_rates.append(catch_rate)
        epsilon_history.append(agent.epsilon)

    # Testing
    n_tests = 50
    agent.epsilon = 0  # evaluate the learned policy greedily

    # The "cheating" baseline monitors the zones with the highest TRUE thief
    # probability. Deriving it from the probabilities (rather than from
    # int(hotspot) and int(hotspot) + 1) always yields the best static
    # placement and can never contain the same zone twice.
    truth = np.array([env.world.get_thief_probability(z) for z in range(env.n_zones)])
    best_zones = np.argsort(truth, kind='stable')[-env.n_sensors:]
    perfect_action = tuple(sorted(int(z) for z in best_zones))

    # Dict literals evaluate in order, so the strategies run (and consume the
    # random stream) in the order listed here.
    results = {
        'Q-Learning AI': _evaluate_strategy(env, agent.choose_action, n_tests),
        'Random': _evaluate_strategy(
            env,
            lambda _state: tuple(np.random.choice(env.n_zones, env.n_sensors, replace=False)),
            n_tests,
        ),
        'Static Uniform': _evaluate_strategy(env, lambda _state: (1, 3, 6, 8), n_tests),
        'Perfect (Cheating)': _evaluate_strategy(env, lambda _state: perfect_action, n_tests),
    }

    # Create plots
    fig = plt.figure(figsize=(16, 12))

    # Plot 1: Learning curve
    ax1 = fig.add_subplot(2, 2, 1)
    window = max(10, n_episodes // 20)
    if len(episode_catch_rates) >= window:
        smoothed = np.convolve(episode_catch_rates, np.ones(window)/window, mode='valid')
        ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw')
        ax1.plot(range(window-1, len(episode_catch_rates)), smoothed,
                 color='green', linewidth=2, label='Smoothed')
        # Only this branch has labelled lines; an unconditional legend()
        # would warn about having nothing to show in the other branch.
        ax1.legend()
    else:
        ax1.plot(episode_catch_rates, color='green', linewidth=2)
    ax1.set_xlabel('Episode', fontsize=12)
    ax1.set_ylabel('Catch Rate (%)', fontsize=12)
    ax1.set_title('🎓 AI Learning Progress', fontsize=14)
    ax1.grid(True, alpha=0.3)

    # Plot 2: Epsilon decay
    ax2 = fig.add_subplot(2, 2, 2)
    ax2.plot(epsilon_history, color='purple', linewidth=2)
    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12)
    ax2.set_title('🔍 Explore vs Exploit Balance', fontsize=14)
    ax2.grid(True, alpha=0.3)

    # Add annotations
    ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10,
                 xytext=(n_episodes*0.1, 0.8), arrowprops=dict(arrowstyle='->', color='gray'))
    ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes-1, epsilon_history[-1]), fontsize=10,
                 xytext=(n_episodes*0.7, 0.3), arrowprops=dict(arrowstyle='->', color='gray'))

    # Plot 3: What AI learned vs Truth
    ax3 = fig.add_subplot(2, 2, 3)

    # Per-zone score: mean Q-value over every (state, action) entry whose
    # action includes that zone.
    zone_values = np.zeros(env.n_zones)
    zone_counts = np.zeros(env.n_zones)
    for state, actions in agent.q_table.items():
        for action, value in actions.items():
            for zone in action:
                zone_values[zone] += value
                zone_counts[zone] += 1
    zone_counts[zone_counts == 0] = 1  # avoid dividing by zero for unseen zones
    learned = zone_values / zone_counts

    x = np.arange(env.n_zones)
    width = 0.35
    ax3.bar(x - width/2, learned / max(learned.max(), 0.01), width,
            label='AI Learned', color='blue', alpha=0.7)
    ax3.bar(x + width/2, truth / truth.max(), width,
            label='True Probability', color='red', alpha=0.7)
    ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1 ({hotspot1})')
    ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2 ({hotspot2})')
    ax3.set_xlabel('Zone', fontsize=12)
    ax3.set_ylabel('Normalized Value', fontsize=12)
    ax3.set_title('🧠 Did AI Learn the Truth?', fontsize=14)
    ax3.legend(loc='upper right')
    ax3.grid(True, alpha=0.3)
    ax3.set_xticks(range(env.n_zones))

    # Plot 4: Final comparison
    ax4 = fig.add_subplot(2, 2, 4)
    names = list(results.keys())
    values = list(results.values())
    colors = ['green', 'gray', 'orange', 'blue']
    bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black')

    for bar, val in zip(bars, values):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')

    ax4.set_ylabel('Catch Rate (%)', fontsize=12)
    ax4.set_title('🏆 Final Comparison', fontsize=14)
    ax4.grid(True, alpha=0.3, axis='y')
    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right')

    # Lay out this figure explicitly instead of pyplot's global "current" one.
    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is closed;
    # closing here stops figures piling up across button clicks. The Figure
    # object itself stays valid for the caller to render.
    plt.close(fig)

    # Results text
    results_text = f"""
## 🎯 Training Complete!

### Training Summary:
- Episodes trained: **{n_episodes}**
- Hotspot 1: Zone **{hotspot1}**
- Hotspot 2: Zone **{hotspot2}**
- Final exploration rate: **{epsilon_history[-1]*100:.1f}%**

### 📊 Test Results (50 test runs each):

| Strategy | Catch Rate |
|----------|------------|
| 🏆 **Q-Learning AI** | **{results['Q-Learning AI']:.1f}%** |
| Random | {results['Random']:.1f}% |
| Static Uniform | {results['Static Uniform']:.1f}% |
| Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% |

### 🧠 What AI Learned:
The AI discovered that zones **{int(hotspot1)}** and **{int(hotspot2)}** have more thieves!

### 🎓 Key Insight:
AI started knowing **NOTHING** and learned through **trial and error**!
"""

    return fig, results_text


def explain_qlearning():
    """Build the two-panel explainer figure for Q-learning.

    The left panel is a text diagram of the state/action/reward/update loop.
    The right panel plots the exploration rate ``1.0 * 0.995**episode``
    (floored at 0.01) and its complement over 500 episodes.

    Returns:
        The matplotlib ``Figure`` holding both panels.
    """

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Q-Learning cycle
    ax1 = axes[0]
    ax1.axis('off')

    # Draw cycle
    cycle_text = """
    ┌─────────────────────────────────────────────────────────────┐
    │                    Q-LEARNING CYCLE                         │
    │                                                             │
    │                      ┌─────────┐                            │
    │                      │  STATE  │                            │
    │                      │(What AI │                            │
    │                      │  sees)  │                            │
    │                      └────┬────┘                            │
    │                           │                                 │
    │                           ▼                                 │
    │    ┌──────────┐     ┌─────────┐     ┌──────────┐           │
    │    │  UPDATE  │◄────│ ACTION  │────►│  REWARD  │           │
    │    │ Q-TABLE  │     │(Place   │     │(Caught   │           │
    │    │(Remember)│     │sensors) │     │thieves?) │           │
    │    └──────────┘     └─────────┘     └──────────┘           │
    │         │                                 │                 │
    │         └─────────────────────────────────┘                 │
    │                     REPEAT!                                 │
    └─────────────────────────────────────────────────────────────┘
    """
    ax1.text(0.5, 0.5, cycle_text, transform=ax1.transAxes, fontsize=10,
             verticalalignment='center', horizontalalignment='center',
             fontfamily='monospace', bbox=dict(boxstyle='round', facecolor='lightyellow'))
    ax1.set_title('How Q-Learning Works', fontsize=14)

    # Plot 2: Epsilon explanation
    ax2 = axes[1]
    episodes = np.arange(500)
    epsilon = 1.0 * (0.995 ** episodes)
    epsilon = np.maximum(epsilon, 0.01)

    ax2.fill_between(episodes, epsilon, alpha=0.3, color='blue', label='EXPLORE')
    ax2.fill_between(episodes, 0, 1-epsilon, alpha=0.3, color='green', label='EXPLOIT')
    ax2.plot(episodes, epsilon, 'b-', linewidth=2)
    ax2.plot(episodes, 1-epsilon, 'g-', linewidth=2)

    # Dashed guides separating the early / middle / late phases labelled below.
    ax2.axvline(50, color='gray', linestyle='--', alpha=0.5)
    ax2.axvline(200, color='gray', linestyle='--', alpha=0.5)
    ax2.axvline(400, color='gray', linestyle='--', alpha=0.5)

    ax2.text(25, 0.5, 'Early:\n80% Explore', fontsize=9, ha='center')
    ax2.text(125, 0.5, 'Middle:\n50-50', fontsize=9, ha='center')
    ax2.text(300, 0.5, 'Late:\n80% Exploit', fontsize=9, ha='center')

    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Probability', fontsize=12)
    ax2.set_title('Explore vs Exploit Over Time', fontsize=14)
    ax2.legend(loc='center right')
    ax2.grid(True, alpha=0.3)

    # Lay out this figure explicitly instead of pyplot's global "current" one.
    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is closed;
    # closing here stops figures piling up across button clicks. The Figure
    # object itself stays valid for the caller to render.
    plt.close(fig)
    return fig


def show_environment(hotspot1, hotspot2):
    """Plot the true per-zone thief probability for the given hotspots.

    Args:
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.

    Returns:
        A matplotlib ``Figure`` with one bar per zone, coloured red above 40%,
        orange above 20% and green otherwise, plus a dashed line per hotspot.
    """

    fig, ax = plt.subplots(figsize=(12, 5))

    world = ThiefWorld(hotspot1, hotspot2)
    zones = np.arange(world.n_zones)
    probs = [world.get_thief_probability(z) for z in zones]

    colors = ['red' if p > 0.4 else 'orange' if p > 0.2 else 'green' for p in probs]
    bars = ax.bar(zones, probs, color=colors, alpha=0.7, edgecolor='black')

    # Print each bar's probability as a percentage just above it.
    for bar, prob in zip(bars, probs):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'{prob*100:.0f}%', ha='center', fontsize=10, fontweight='bold')

    ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2, label=f'Hotspot 1 ({hotspot1})')
    ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2, label=f'Hotspot 2 ({hotspot2})')

    ax.set_xlabel('Zone', fontsize=12)
    ax.set_ylabel('Thief Probability', fontsize=12)
    ax.set_title('🦹 Secret Thief Locations (AI Must Discover This!)', fontsize=14)
    ax.set_xticks(zones)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    # Lay out this figure explicitly instead of pyplot's global "current" one.
    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is closed;
    # closing here stops figures piling up across button clicks. The Figure
    # object itself stays valid for the caller to render.
    plt.close(fig)
    return fig


def simulate_one_episode(hotspot1, hotspot2):
    """Run one 30-day episode with a fresh agent and visualise it.

    The agent starts with an empty Q-table and a fixed 50% exploration rate,
    and keeps learning from each day's outcome during the episode.

    Args:
        hotspot1: Centre of the first thief hotspot.
        hotspot2: Centre of the second thief hotspot.

    Returns:
        ``(fig, summary)``: a 2x2 matplotlib figure (daily placements, daily
        catches, cumulative totals, zone usage) and a Markdown summary.
    """

    np.random.seed(None)  # Random seed for variety

    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()
    agent.epsilon = 0.5  # 50% explore for demo

    state = env.reset()

    # Track daily data
    daily_actions = []
    daily_caught = []
    daily_thieves = []

    for _day in range(30):
        action = agent.choose_action(state)
        daily_actions.append(action)

        caught_before = env.total_caught
        thieves_before = env.total_thieves

        next_state, reward, done, _info = env.step(action)

        daily_caught.append(env.total_caught - caught_before)
        daily_thieves.append(env.total_thieves - thieves_before)

        # Update the value of the state the action was CHOSEN in, bootstrapping
        # from the state it led to. Overwriting `state` before this call would
        # credit the successor state and bootstrap from itself.
        agent.learn(state, action, reward, next_state, done)
        state = next_state

    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Plot 1: Sensor placements over days (one point per sensor per day)
    ax1 = axes[0, 0]
    placement_days = [day for day, action in enumerate(daily_actions) for _ in action]
    placement_zones = [zone for action in daily_actions for zone in action]
    ax1.scatter(placement_days, placement_zones, c='blue', s=30, alpha=0.6)

    ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label='Hotspot 1')
    ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label='Hotspot 2')
    ax1.set_xlabel('Day', fontsize=12)
    ax1.set_ylabel('Zone', fontsize=12)
    ax1.set_title('📍 Where AI Placed Sensors Each Day', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_yticks(range(10))

    # Plot 2: Daily catches
    ax2 = axes[0, 1]
    days = range(1, 31)
    ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught')
    ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves')
    ax2.set_xlabel('Day', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)
    ax2.set_title('🎯 Daily Catches', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Plot 3: Cumulative performance
    ax3 = axes[1, 0]
    cum_caught = np.cumsum(daily_caught)
    cum_thieves = np.cumsum(daily_thieves)
    ax3.fill_between(days, cum_caught, alpha=0.3, color='green')
    ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught')
    ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves')
    ax3.set_xlabel('Day', fontsize=12)
    ax3.set_ylabel('Cumulative Count', fontsize=12)
    ax3.set_title('📈 Cumulative Performance', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # Plot 4: Zone usage
    ax4 = axes[1, 1]
    zone_usage = np.zeros(10)
    for action in daily_actions:
        for zone in action:
            zone_usage[zone] += 1

    # Zones drawn in blue: the one containing each hotspot and its right neighbour.
    near_hotspots = {int(hotspot1), int(hotspot1) + 1, int(hotspot2), int(hotspot2) + 1}
    colors = ['blue' if z in near_hotspots else 'gray' for z in range(10)]
    ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black')
    ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5)
    ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5)
    ax4.set_xlabel('Zone', fontsize=12)
    ax4.set_ylabel('Times Used', fontsize=12)
    ax4.set_title('🗺️ Zone Usage (Blue = Near Hotspots)', fontsize=14)
    ax4.set_xticks(range(10))
    ax4.grid(True, alpha=0.3, axis='y')

    # Lay out this figure explicitly instead of pyplot's global "current" one.
    fig.tight_layout()
    # pyplot keeps a reference to every figure it creates until it is closed;
    # closing here stops figures piling up across button clicks. The Figure
    # object itself stays valid for the caller to render.
    plt.close(fig)

    # Summary
    catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
    summary = f"""
## 📊 Episode Summary

- **Total Thieves:** {env.total_thieves}
- **Total Caught:** {env.total_caught}
- **Catch Rate:** {catch_rate:.1f}%

### Zones Most Used:
{', '.join([f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1]])}

### Note:
This is just ONE episode with 50% exploration.
Train for 500+ episodes to see real learning!
"""

    return fig, summary


# ==============================================================================
# GRADIO INTERFACE
# ==============================================================================

# Build the five-tab UI. Components are created in source order inside the
# nested context managers, so the order of statements below is the layout.
with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo:
    
    # Page header shown above the tabs.
    gr.Markdown("""
    # 🤖 Q-Learning AI for Sensor Placement
    
    **Watch an AI learn where to place sensors to catch thieves!**
    
    The AI starts knowing NOTHING and learns through trial-and-error.
    
    ---
    """)
    
    with gr.Tabs():
        
        # ==== TAB 1: Explanation ====
        # Static explainer text plus a button that renders explain_qlearning().
        with gr.TabItem("1️⃣ What is Q-Learning?"):
            gr.Markdown("""
            ## 🎓 Q-Learning Explained Simply
            
            ### Like Teaching a Dog:
            ```
            1. Dog tries something → 2. Gets treat (or not) → 3. Remembers → 4. Gets smarter!
            ```
            
            ### For Our AI:
            ```
            1. AI places sensors → 2. Catches thieves (reward!) → 3. Updates Q-Table → 4. Gets smarter!
            ```
            
            ### The Q-Table (AI's Memory):
            
            | State | Action | Expected Reward |
            |-------|--------|-----------------|
            | "Day 1" | Zones (1,3,6,8) | 1.5 points |
            | "Day 1" | Zones (2,3,7,8) | 3.2 points ← Better! |
            
            ### Explore vs Exploit:
            - **EXPLORE**: Try random things to learn
            - **EXPLOIT**: Use what you already know
            
            Early training → More EXPLORE
            Late training → More EXPLOIT
            """)
            
            explain_btn = gr.Button("📊 Show Visual Explanation", variant="primary")
            explain_plot = gr.Plot()
            # No inputs: the explainer figure does not depend on any control.
            explain_btn.click(explain_qlearning, outputs=explain_plot)
        
        # ==== TAB 2: Environment ====
        # Two hotspot sliders feed show_environment(), which plots the true
        # per-zone thief probabilities.
        with gr.TabItem("2️⃣ The Secret World"):
            gr.Markdown("""
            ## 🦹 Where Do Thieves Appear?
            
            The AI doesn't know this! It must DISCOVER it through learning.
            
            Adjust the hotspot locations and see the thief distribution:
            """)
            
            with gr.Row():
                h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location")
                h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location")
            
            env_btn = gr.Button("🗺️ Show Thief Distribution", variant="primary")
            env_plot = gr.Plot()
            env_btn.click(show_environment, [h1_slider, h2_slider], env_plot)
        
        # ==== TAB 3: One Episode ====
        # Runs simulate_one_episode() with this tab's own hotspot sliders and
        # shows its figure and Markdown summary.
        with gr.TabItem("3️⃣ Watch One Episode"):
            gr.Markdown("""
            ## 👀 See One Month (30 Days) of Simulation
            
            Watch how AI makes decisions and catches thieves.
            
            (Note: This is untrained AI with 50% exploration rate)
            """)
            
            with gr.Row():
                h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            
            ep_btn = gr.Button("▶️ Run One Episode", variant="primary")
            ep_plot = gr.Plot()
            ep_summary = gr.Markdown()
            ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary])
        
        # ==== TAB 4: Full Training ====
        # Episode-count and hotspot sliders feed train_and_test(), which
        # returns the results figure and a Markdown report.
        with gr.TabItem("4️⃣ Train the AI!"):
            gr.Markdown("""
            ## 🏋️ Train Q-Learning AI
            
            Train the AI and compare it against other strategies!
            
            ⚠️ Training takes a few seconds depending on episodes.
            """)
            
            with gr.Row():
                episodes_slider = gr.Slider(100, 1000, value=300, step=50, 
                                           label="Number of Episodes")
            
            with gr.Row():
                h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            
            train_btn = gr.Button("🚀 Train AI!", variant="primary", size="lg")
            
            train_plot = gr.Plot()
            train_results = gr.Markdown()
            
            # Input order must match train_and_test's positional parameters.
            train_btn.click(train_and_test, 
                           [episodes_slider, h1_train, h2_train],
                           [train_plot, train_results])
        
        # ==== TAB 5: Summary ====
        # Static reference text only; no interactive components.
        with gr.TabItem("5️⃣ Key Concepts"):
            gr.Markdown("""
            ## 📚 Summary: Q-Learning Key Concepts
            
            ### 1. Q-Table
            ```
            A "cheat sheet" that stores:
            "In STATE X, if I do ACTION Y, I expect REWARD Z"
            ```
            
            ### 2. State
            ```
            What the AI "sees" at any moment.
            Example: (most_tried_zone, best_zone_so_far)
            ```
            
            ### 3. Action
            ```
            What the AI can do.
            Example: Place sensors in zones (2, 3, 7, 8)
            ```
            
            ### 4. Reward
            ```
            Points for good actions.
            Example: +1 for each thief caught
            ```
            
            ### 5. Epsilon (ε)
            ```
            Exploration rate.
            ε = 1.0 → 100% random (exploring)
            ε = 0.01 → 1% random (exploiting knowledge)
            ```
            
            ### 6. Learning Formula
            ```
            Q(s,a) = Q(s,a) + α × (reward + γ × max(Q(s',a')) - Q(s,a))
            
            In simple terms:
            New Memory = Old Memory + Learning Rate × (Reality - Expectation)
            ```
            
            ---
            
            ## 🎯 Why This Matters
            
            This same technique is used in:
            - 🎮 Game AI (AlphaGo, Chess engines)
            - 🚗 Self-driving cars
            - 🤖 Robots
            - 📱 Recommendation systems
            
            **You just learned how real AI works!** 🎓
            """)
    
    # Page footer shown below the tabs.
    gr.Markdown("""
    ---
    
    ### 🔗 About
    
    This demo shows **Q-Learning Reinforcement Learning** for sensor placement.
    
    The AI learns through trial-and-error, just like humans!
    """)


# Launch
# Start the Gradio server only when this file is run as a script, not when it
# is imported.
if __name__ == "__main__":
    demo.launch()