Spaces:

Naz786
/

Q-Learning-Sensor-Placement

Sleeping

App Files Files Community

Q-Learning-Sensor-Placement / app.py

Naz786

Create app.py

9c74b9c verified about 2 months ago

raw

history blame contribute delete

27.8 kB

	"""
	Q-Learning AI for Sensor Placement - Interactive Demo
	For Hugging Face Spaces
	"""

	import numpy as np
	import matplotlib.pyplot as plt
	import gradio as gr
	from collections import defaultdict

	# Seed NumPy's global RNG once at import so module-level behaviour is
	# reproducible by default; individual entry points may reseed it themselves.
	np.random.seed(42)


	# ==============================================================================
	# PART 1: THE SECRET WORLD
	# ==============================================================================

	class ThiefWorld:
	    """Ground-truth thief distribution over 10 zones (hidden from the agent).

	    Thief likelihood is the sum of two Gaussian-shaped bumps centred on
	    ``hotspot1`` and ``hotspot2`` plus a small constant background rate.
	    """

	    def __init__(self, hotspot1=2.5, hotspot2=7.0):
	        self.n_zones = 10
	        self.hotspot1 = hotspot1
	        self.hotspot2 = hotspot2

	    def get_thief_probability(self, zone):
	        """Return the chance (capped at 1.0) that a thief shows up in ``zone``."""
	        # The probability is evaluated at the middle of the zone.
	        center = zone + 0.5
	        near_first = 0.6 * np.exp(-((center - self.hotspot1) ** 2) / 1.0)
	        near_second = 0.4 * np.exp(-((center - self.hotspot2) ** 2) / 0.8)
	        return min(near_first + near_second + 0.05, 1.0)

	    def generate_thieves(self):
	        """Sample one day: a float array of length ``n_zones``, 1 where a thief appeared."""
	        # One uniform draw per zone, in zone order, from NumPy's global RNG.
	        appeared = [
	            np.random.random() < self.get_thief_probability(zone)
	            for zone in range(self.n_zones)
	        ]
	        return np.array(appeared, dtype=float)


	# ==============================================================================
	# PART 2: SENSOR
	# ==============================================================================

	class Sensor:
	    """An imperfect detector that catches a present thief with fixed probability."""

	    def __init__(self, catch_probability=0.9):
	        self.catch_prob = catch_probability

	    def try_catch(self, thief_present):
	        """Return whether the sensor fires; never fires when no thief is present."""
	        if not thief_present:
	            # No random draw is consumed when there is nothing to catch.
	            return False
	        roll = np.random.random()
	        return roll < self.catch_prob


	# ==============================================================================
	# PART 3: ENVIRONMENT
	# ==============================================================================

	class SensorPlacementEnv:
	    """Episodic environment: each day the agent places sensors and thieves are sampled.

	    The observation is a coarse summary ``(most_tried_zone, best_zone)`` built
	    from per-zone attempt and catch counters accumulated during the episode.
	    An episode ends after ``episode_length`` days.
	    """

	    # Number of days (steps) after which ``step`` reports ``done``.
	    episode_length = 30

	    def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0):
	        self.world = ThiefWorld(hotspot1, hotspot2)
	        self.sensor = Sensor()
	        self.n_sensors = n_sensors
	        # Keep the zone count in sync with the world instead of repeating it.
	        self.n_zones = self.world.n_zones
	        self.reset()

	    def reset(self):
	        """Clear all counters and return the initial state ``(0, 0)``."""
	        self.zone_attempts = np.zeros(self.n_zones)
	        self.zone_catches = np.zeros(self.n_zones)
	        self.day = 0
	        self.total_caught = 0
	        self.total_thieves = 0
	        return self._get_state()

	    def _get_state(self):
	        """Return ``(most_tried_zone, zone_with_best_catch_rate)`` as plain ints."""
	        if self.zone_attempts.sum() == 0:
	            return (0, 0)
	        most_tried = int(np.argmax(self.zone_attempts))
	        # Zones never tried keep a catch rate of 0 (avoids division by zero).
	        catch_rates = np.zeros(self.n_zones)
	        tried = self.zone_attempts > 0
	        catch_rates[tried] = self.zone_catches[tried] / self.zone_attempts[tried]
	        best_zone = int(np.argmax(catch_rates))
	        return (most_tried, best_zone)

	    def step(self, action):
	        """Advance one day with sensors placed at the zones listed in ``action``.

	        Args:
	            action: iterable of zone indices. Indices outside
	                ``[0, n_zones)`` are ignored.

	        Returns:
	            ``(state, reward, done, info)`` where ``reward`` is the number of
	            thieves caught plus 0.1 per distinct valid zone covered, and
	            ``info`` is ``{'caught': <int>}``.
	        """
	        # Copy so that marking thieves as caught never mutates the world's data.
	        thieves = np.array(self.world.generate_thieves())
	        self.total_thieves += int(thieves.sum())

	        # Negative indices would silently wrap around in NumPy, so reject them
	        # together with indices past the last zone.
	        valid_zones = [zone for zone in action if 0 <= zone < self.n_zones]

	        caught = 0
	        for zone in valid_zones:
	            self.zone_attempts[zone] += 1
	            if thieves[zone] == 1 and self.sensor.try_catch(True):
	                caught += 1
	                self.zone_catches[zone] += 1
	                # A thief can be caught only once per day, even if the same
	                # zone appears several times in the action.
	                thieves[zone] = 0

	        self.total_caught += caught
	        self.day += 1
	        reward = caught + 0.1 * len(set(valid_zones))
	        done = self.day >= self.episode_length

	        return self._get_state(), reward, done, {'caught': caught}


	# ==============================================================================
	# PART 4: Q-LEARNING AGENT
	# ==============================================================================

	class QLearningAgent:
	    """Tabular epsilon-greedy Q-learning agent over a fixed menu of placements.

	    ``q_table[state][action]`` holds the current value estimate; unseen
	    entries default to 0.0. Note that reading an entry creates it, so the
	    table also records every (state, action) pair that was merely evaluated.
	    """

	    # The fixed set of candidate 4-sensor placements the agent chooses from.
	    # Kept as a class-level tuple so it is built once rather than on each call.
	    _ACTIONS = (
	        (1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8),
	        (0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5),
	        (5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7),
	        (2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7),
	        (3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9),
	    )

	    def __init__(self):
	        self.q_table = defaultdict(lambda: defaultdict(float))
	        self.learning_rate = 0.1
	        self.discount_factor = 0.95
	        self.epsilon = 1.0
	        self.epsilon_decay = 0.995
	        self.epsilon_min = 0.01

	    def _get_possible_actions(self):
	        """Return the candidate placements as a fresh list of zone tuples."""
	        return list(self._ACTIONS)

	    def choose_action(self, state):
	        """Pick an action for ``state``: random with probability epsilon, else greedy.

	        Greedy ties are resolved in favour of the earliest action in the menu.
	        """
	        actions = self._ACTIONS
	        if np.random.random() < self.epsilon:
	            return actions[np.random.randint(len(actions))]
	        # max() needs no magic "minus infinity" sentinel, so arbitrarily
	        # negative Q-values are still ranked correctly.
	        action_values = self.q_table[state]
	        return max(actions, key=lambda candidate: action_values[candidate])

	    def learn(self, state, action, reward, next_state, done):
	        """Apply one Q-learning update for the transition ``state -> next_state``.

	        When ``done`` is true the future value is taken as 0 (terminal step).
	        """
	        old_q = self.q_table[state][action]
	        if done:
	            max_future_q = 0
	        else:
	            next_values = self.q_table[next_state]
	            max_future_q = max(next_values[candidate] for candidate in self._ACTIONS)
	        target = reward + self.discount_factor * max_future_q
	        self.q_table[state][action] = old_q + self.learning_rate * (target - old_q)

	    def decay_epsilon(self):
	        """Multiply epsilon by the decay factor, never going below ``epsilon_min``."""
	        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


	# ==============================================================================
	# TRAINING AND TESTING FUNCTIONS
	# ==============================================================================

	def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()):
	    """Train a Q-learning agent, benchmark it against baselines, and plot the results.

	    Args:
	        n_episodes: number of training episodes (coerced to ``int``; must be >= 1).
	        hotspot1: location of the first thief hotspot.
	        hotspot2: location of the second thief hotspot.
	        progress: Gradio progress tracker used to report training progress.

	    Returns:
	        ``(fig, results_text)``: a 2x2 matplotlib figure (learning curve,
	        epsilon decay, learned-vs-true zone values, strategy comparison) and a
	        Markdown summary string.

	    Raises:
	        ValueError: if ``n_episodes`` is less than 1.
	    """
	    # UI sliders may deliver floats; range() below needs an int.
	    n_episodes = int(n_episodes)
	    if n_episodes < 1:
	        raise ValueError("n_episodes must be at least 1")

	    # Fixed seed so the same settings always produce the same charts.
	    np.random.seed(42)

	    days_per_episode = 30

	    # Training
	    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
	    agent = QLearningAgent()

	    episode_catch_rates = []
	    epsilon_history = []

	    for _ in progress.tqdm(range(n_episodes), desc="Training AI"):
	        state = env.reset()

	        for _day in range(days_per_episode):
	            action = agent.choose_action(state)
	            next_state, reward, done, _info = env.step(action)
	            agent.learn(state, action, reward, next_state, done)
	            state = next_state
	            if done:
	                break

	        agent.decay_epsilon()
	        episode_catch_rates.append(env.total_caught / max(env.total_thieves, 1) * 100)
	        epsilon_history.append(agent.epsilon)

	    final_epsilon = epsilon_history[-1]

	    # Testing
	    n_tests = 50
	    n_zones = env.n_zones
	    n_sensors = env.n_sensors

	    # True per-zone thief probabilities: used by the oracle baseline and plot 3.
	    truth = np.array([env.world.get_thief_probability(z) for z in range(n_zones)])

	    # Oracle baseline: the n_sensors *distinct* zones with the highest true
	    # probability. Deriving the zones from int(hotspot) could repeat a zone
	    # (adjacent hotspots, or clamping at the last zone) and could miss the
	    # genuinely best zones when a hotspot sits on a zone boundary.
	    perfect_action = tuple(
	        int(z) for z in np.argsort(-truth, kind="stable")[:n_sensors]
	    )

	    # Pure exploitation during evaluation.
	    agent.epsilon = 0

	    # Evaluation order is fixed because every strategy draws from the same
	    # global RNG stream; dict order also drives the bar order in plot 4.
	    results = {}
	    results['Q-Learning AI'] = _mean_catch_rate(
	        env, agent.choose_action, n_tests, days_per_episode)
	    results['Random'] = _mean_catch_rate(
	        env,
	        lambda _state: tuple(np.random.choice(n_zones, n_sensors, replace=False)),
	        n_tests, days_per_episode)
	    results['Static Uniform'] = _mean_catch_rate(
	        env, lambda _state: (1, 3, 6, 8), n_tests, days_per_episode)
	    results['Perfect (Cheating)'] = _mean_catch_rate(
	        env, lambda _state: perfect_action, n_tests, days_per_episode)

	    # Create plots
	    fig = plt.figure(figsize=(16, 12))

	    # Plot 1: Learning curve
	    ax1 = fig.add_subplot(2, 2, 1)
	    window = max(10, n_episodes // 20)
	    if len(episode_catch_rates) >= window:
	        smoothed = np.convolve(episode_catch_rates, np.ones(window) / window, mode='valid')
	        ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw')
	        ax1.plot(range(window - 1, len(episode_catch_rates)), smoothed,
	                 color='green', linewidth=2, label='Smoothed')
	    else:
	        # Labelled so that legend() below always has an artist to show.
	        ax1.plot(episode_catch_rates, color='green', linewidth=2, label='Raw')
	    ax1.set_xlabel('Episode', fontsize=12)
	    ax1.set_ylabel('Catch Rate (%)', fontsize=12)
	    ax1.set_title('🎓 AI Learning Progress', fontsize=14)
	    ax1.legend()
	    ax1.grid(True, alpha=0.3)

	    # Plot 2: Epsilon decay
	    ax2 = fig.add_subplot(2, 2, 2)
	    ax2.plot(epsilon_history, color='purple', linewidth=2)
	    ax2.set_xlabel('Episode', fontsize=12)
	    ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12)
	    ax2.set_title('🔍 Explore vs Exploit Balance', fontsize=14)
	    ax2.grid(True, alpha=0.3)

	    # Add annotations
	    ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10,
	                 xytext=(n_episodes * 0.1, 0.8),
	                 arrowprops=dict(arrowstyle='->', color='gray'))
	    ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes - 1, final_epsilon), fontsize=10,
	                 xytext=(n_episodes * 0.7, 0.3),
	                 arrowprops=dict(arrowstyle='->', color='gray'))

	    # Plot 3: What AI learned vs Truth
	    ax3 = fig.add_subplot(2, 2, 3)

	    # Average the Q-value of every table entry over the zones its action covers.
	    zone_values = np.zeros(n_zones)
	    zone_counts = np.zeros(n_zones)
	    for action_values in agent.q_table.values():
	        for action, value in action_values.items():
	            for zone in action:
	                zone_values[zone] += value
	                zone_counts[zone] += 1
	    zone_counts[zone_counts == 0] = 1  # avoid dividing by zero for unseen zones
	    learned = zone_values / zone_counts

	    x = np.arange(n_zones)
	    width = 0.35
	    ax3.bar(x - width / 2, learned / max(learned.max(), 0.01), width,
	            label='AI Learned', color='blue', alpha=0.7)
	    ax3.bar(x + width / 2, truth / truth.max(), width,
	            label='True Probability', color='red', alpha=0.7)
	    ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5,
	                label=f'Hotspot 1 ({hotspot1})')
	    ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5,
	                label=f'Hotspot 2 ({hotspot2})')
	    ax3.set_xlabel('Zone', fontsize=12)
	    ax3.set_ylabel('Normalized Value', fontsize=12)
	    ax3.set_title('🧠 Did AI Learn the Truth?', fontsize=14)
	    ax3.legend(loc='upper right')
	    ax3.grid(True, alpha=0.3)
	    ax3.set_xticks(range(n_zones))

	    # Plot 4: Final comparison
	    ax4 = fig.add_subplot(2, 2, 4)
	    names = list(results.keys())
	    values = list(results.values())
	    colors = ['green', 'gray', 'orange', 'blue']
	    bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black')

	    for bar, val in zip(bars, values):
	        ax4.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
	                 f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')

	    ax4.set_ylabel('Catch Rate (%)', fontsize=12)
	    ax4.set_title('🏆 Final Comparison', fontsize=14)
	    ax4.grid(True, alpha=0.3, axis='y')
	    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right')

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()

	    # Results text (Markdown; table rows need plain, unescaped pipes)
	    results_text = f"""
	## 🎯 Training Complete!

	### Training Summary:
	- Episodes trained: {n_episodes}
	- Hotspot 1: Zone {hotspot1}
	- Hotspot 2: Zone {hotspot2}
	- Final exploration rate: **{final_epsilon * 100:.1f}%**

	### 📊 Test Results ({n_tests} test runs each):

	| Strategy | Catch Rate |
	|----------|------------|
	| 🏆 Q-Learning AI | {results['Q-Learning AI']:.1f}% |
	| Random | {results['Random']:.1f}% |
	| Static Uniform | {results['Static Uniform']:.1f}% |
	| Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% |

	### 🧠 What AI Learned:
	The AI discovered that zones {int(hotspot1)} and {int(hotspot2)} have more thieves!

	### 🎓 Key Insight:
	AI started knowing NOTHING and learned through trial and error!
	"""

	    return fig, results_text


	def _mean_catch_rate(env, choose_action, n_tests, max_days):
	    """Return the mean catch rate (%) of a policy over ``n_tests`` fresh episodes.

	    ``choose_action`` maps the current state to an action tuple; each episode
	    runs until the environment reports ``done`` or ``max_days`` have passed.
	    """
	    rates = []
	    for _ in range(n_tests):
	        state = env.reset()
	        for _day in range(max_days):
	            state, _reward, done, _info = env.step(choose_action(state))
	            if done:
	                break
	        rates.append(env.total_caught / max(env.total_thieves, 1) * 100)
	    return np.mean(rates)


	def explain_qlearning():
	    """Build the two-panel explainer figure and return it.

	    Left panel: a text diagram of the Q-learning cycle. Right panel: how the
	    explore/exploit split evolves over 500 episodes with a 0.995 decay and a
	    0.01 floor.
	    """
	    fig, (cycle_ax, balance_ax) = plt.subplots(1, 2, figsize=(14, 5))

	    # Left panel: the cycle diagram, rendered as monospace text on a blank axis.
	    cycle_ax.axis('off')
	    cycle_text = """
	┌─────────────────────────────────────────────────────────────┐
	│ Q-LEARNING CYCLE │
	│ │
	│ ┌─────────┐ │
	│ │ STATE │ │
	│ │(What AI │ │
	│ │ sees) │ │
	│ └────┬────┘ │
	│ │ │
	│ ▼ │
	│ ┌──────────┐ ┌─────────┐ ┌──────────┐ │
	│ │ UPDATE │◄────│ ACTION │────►│ REWARD │ │
	│ │ Q-TABLE │ │(Place │ │(Caught │ │
	│ │(Remember)│ │sensors) │ │thieves?) │ │
	│ └──────────┘ └─────────┘ └──────────┘ │
	│ │ │ │
	│ └─────────────────────────────────┘ │
	│ REPEAT! │
	└─────────────────────────────────────────────────────────────┘
	"""
	    box_style = dict(boxstyle='round', facecolor='lightyellow')
	    cycle_ax.text(0.5, 0.5, cycle_text, transform=cycle_ax.transAxes, fontsize=10,
	                  verticalalignment='center', horizontalalignment='center',
	                  fontfamily='monospace', bbox=box_style)
	    cycle_ax.set_title('How Q-Learning Works', fontsize=14)

	    # Right panel: exploration probability and its complement.
	    episodes = np.arange(500)
	    explore = np.maximum(1.0 * (0.995 ** episodes), 0.01)
	    exploit = 1 - explore

	    balance_ax.fill_between(episodes, explore, alpha=0.3, color='blue', label='EXPLORE')
	    balance_ax.fill_between(episodes, 0, exploit, alpha=0.3, color='green', label='EXPLOIT')
	    balance_ax.plot(episodes, explore, 'b-', linewidth=2)
	    balance_ax.plot(episodes, exploit, 'g-', linewidth=2)

	    # Dashed dividers between the early / middle / late phases.
	    for boundary in (50, 200, 400):
	        balance_ax.axvline(boundary, color='gray', linestyle='--', alpha=0.5)

	    phase_labels = (
	        (25, 'Early:\n80% Explore'),
	        (125, 'Middle:\n50-50'),
	        (300, 'Late:\n80% Exploit'),
	    )
	    for x_pos, caption in phase_labels:
	        balance_ax.text(x_pos, 0.5, caption, fontsize=9, ha='center')

	    balance_ax.set_xlabel('Episode', fontsize=12)
	    balance_ax.set_ylabel('Probability', fontsize=12)
	    balance_ax.set_title('Explore vs Exploit Over Time', fontsize=14)
	    balance_ax.legend(loc='center right')
	    balance_ax.grid(True, alpha=0.3)

	    plt.tight_layout()
	    return fig


	def show_environment(hotspot1, hotspot2):
	    """Return a bar chart of the true per-zone thief probability for the given hotspots."""
	    world = ThiefWorld(hotspot1, hotspot2)
	    zones = np.arange(10)
	    probs = [world.get_thief_probability(z) for z in zones]

	    # Traffic-light colouring: red above 0.4, orange above 0.2, green otherwise.
	    colors = []
	    for p in probs:
	        if p > 0.4:
	            colors.append('red')
	        elif p > 0.2:
	            colors.append('orange')
	        else:
	            colors.append('green')

	    fig, ax = plt.subplots(figsize=(12, 5))
	    bars = ax.bar(zones, probs, color=colors, alpha=0.7, edgecolor='black')

	    # Percentage label just above each bar.
	    for rect, p in zip(bars, probs):
	        label_x = rect.get_x() + rect.get_width() / 2
	        label_y = rect.get_height() + 0.02
	        ax.text(label_x, label_y, f'{p*100:.0f}%',
	                ha='center', fontsize=10, fontweight='bold')

	    ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2,
	               label=f'Hotspot 1 ({hotspot1})')
	    ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2,
	               label=f'Hotspot 2 ({hotspot2})')

	    ax.set_xlabel('Zone', fontsize=12)
	    ax.set_ylabel('Thief Probability', fontsize=12)
	    ax.set_title('🦹 Secret Thief Locations (AI Must Discover This!)', fontsize=14)
	    ax.set_xticks(zones)
	    ax.legend()
	    ax.grid(True, alpha=0.3, axis='y')

	    plt.tight_layout()
	    return fig


	def simulate_one_episode(hotspot1, hotspot2):
	    """Run one 30-day episode with a fresh agent and chart what happened.

	    The agent starts with an empty Q-table and a fixed 50% exploration rate,
	    and keeps learning from each day's outcome during the episode.

	    Args:
	        hotspot1: location of the first thief hotspot.
	        hotspot2: location of the second thief hotspot.

	    Returns:
	        ``(fig, summary)``: a 2x2 matplotlib figure (placements per day, daily
	        catches, cumulative totals, zone usage) and a Markdown summary string.
	    """
	    np.random.seed(None)  # Random seed for variety

	    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
	    agent = QLearningAgent()
	    agent.epsilon = 0.5  # 50% explore for demo

	    state = env.reset()

	    # Track daily data
	    daily_actions = []
	    daily_caught = []
	    daily_thieves = []

	    for _day in range(30):
	        action = agent.choose_action(state)
	        daily_actions.append(action)

	        caught_before = env.total_caught
	        thieves_before = env.total_thieves

	        next_state, reward, done, _info = env.step(action)

	        daily_caught.append(env.total_caught - caught_before)
	        daily_thieves.append(env.total_thieves - thieves_before)

	        # Update the value of the state the action was actually taken in;
	        # only afterwards move on to the state the environment returned.
	        agent.learn(state, action, reward, next_state, done)
	        state = next_state

	    # Create visualization
	    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

	    # Plot 1: Sensor placements over days
	    ax1 = axes[0, 0]
	    placement_days = [day for day, action in enumerate(daily_actions) for _ in action]
	    placement_zones = [zone for action in daily_actions for zone in action]
	    ax1.scatter(placement_days, placement_zones, c='blue', s=30, alpha=0.6)

	    ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label='Hotspot 1')
	    ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label='Hotspot 2')
	    ax1.set_xlabel('Day', fontsize=12)
	    ax1.set_ylabel('Zone', fontsize=12)
	    ax1.set_title('📍 Where AI Placed Sensors Each Day', fontsize=14)
	    ax1.legend()
	    ax1.grid(True, alpha=0.3)
	    ax1.set_yticks(range(10))

	    # Plot 2: Daily catches
	    ax2 = axes[0, 1]
	    days = range(1, len(daily_caught) + 1)
	    ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught')
	    ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves')
	    ax2.set_xlabel('Day', fontsize=12)
	    ax2.set_ylabel('Count', fontsize=12)
	    ax2.set_title('🎯 Daily Catches', fontsize=14)
	    ax2.legend()
	    ax2.grid(True, alpha=0.3)

	    # Plot 3: Cumulative performance
	    ax3 = axes[1, 0]
	    cum_caught = np.cumsum(daily_caught)
	    cum_thieves = np.cumsum(daily_thieves)
	    ax3.fill_between(days, cum_caught, alpha=0.3, color='green')
	    ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught')
	    ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves')
	    ax3.set_xlabel('Day', fontsize=12)
	    ax3.set_ylabel('Cumulative Count', fontsize=12)
	    ax3.set_title('📈 Cumulative Performance', fontsize=14)
	    ax3.legend()
	    ax3.grid(True, alpha=0.3)

	    # Plot 4: Zone usage
	    ax4 = axes[1, 1]
	    zone_usage = np.zeros(10)
	    for zone in placement_zones:
	        zone_usage[zone] += 1

	    # Zones containing a hotspot, plus the zone immediately after each.
	    near_hotspots = {int(hotspot1), int(hotspot1) + 1, int(hotspot2), int(hotspot2) + 1}
	    colors = ['blue' if z in near_hotspots else 'gray' for z in range(10)]
	    ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black')
	    ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5)
	    ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5)
	    ax4.set_xlabel('Zone', fontsize=12)
	    ax4.set_ylabel('Times Used', fontsize=12)
	    ax4.set_title('🗺️ Zone Usage (Blue = Near Hotspots)', fontsize=14)
	    ax4.set_xticks(range(10))
	    ax4.grid(True, alpha=0.3, axis='y')

	    # Lay out this figure explicitly instead of pyplot's "current" figure.
	    fig.tight_layout()

	    # Summary
	    catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
	    # Three most-used zones, most used first.
	    top_zones = ', '.join(f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1])
	    summary = f"""
	## 📊 Episode Summary

	- Total Thieves: {env.total_thieves}
	- Total Caught: {env.total_caught}
	- Catch Rate: {catch_rate:.1f}%

	### Zones Most Used:
	{top_zones}

	### Note:
	This is just ONE episode with 50% exploration.
	Train for 500+ episodes to see real learning!
	"""

	    return fig, summary


	# ==============================================================================
	# GRADIO INTERFACE
	# ==============================================================================

	# Assemble the whole UI inside one Blocks context. The resulting `demo`
	# object is what the __main__ guard at the bottom of the file launches.
	with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo:

	# Page header shown above the tabs.
	gr.Markdown("""
	# 🤖 Q-Learning AI for Sensor Placement

	Watch an AI learn where to place sensors to catch thieves!

	The AI starts knowing NOTHING and learns through trial-and-error.

	---
	""")

	with gr.Tabs():

	# ==== TAB 1: Explanation ====
	with gr.TabItem("1️⃣ What is Q-Learning?"):
	# NOTE(review): the table rows in this Markdown use `\|`, which Python keeps
	# as a literal backslash followed by a pipe — confirm the table renders as intended.
	gr.Markdown("""
	## 🎓 Q-Learning Explained Simply

	### Like Teaching a Dog:
	```
	1. Dog tries something → 2. Gets treat (or not) → 3. Remembers → 4. Gets smarter!
	```

	### For Our AI:
	```
	1. AI places sensors → 2. Catches thieves (reward!) → 3. Updates Q-Table → 4. Gets smarter!
	```

	### The Q-Table (AI's Memory):

	\| State \| Action \| Expected Reward \|
	\|-------\|--------\|-----------------\|
	\| "Day 1" \| Zones (1,3,6,8) \| 1.5 points \|
	\| "Day 1" \| Zones (2,3,7,8) \| 3.2 points ← Better! \|

	### Explore vs Exploit:
	- EXPLORE: Try random things to learn
	- EXPLOIT: Use what you already know

	Early training → More EXPLORE
	Late training → More EXPLOIT
	""")

	explain_btn = gr.Button("📊 Show Visual Explanation", variant="primary")
	explain_plot = gr.Plot()
	# No inputs: the click renders the figure returned by explain_qlearning.
	explain_btn.click(explain_qlearning, outputs=explain_plot)

	# ==== TAB 2: Environment ====
	with gr.TabItem("2️⃣ The Secret World"):
	gr.Markdown("""
	## 🦹 Where Do Thieves Appear?

	The AI doesn't know this! It must DISCOVER it through learning.

	Adjust the hotspot locations and see the thief distribution:
	""")

	# Each tab defines its own pair of hotspot sliders; they are separate
	# components and are not linked to the sliders on the other tabs.
	with gr.Row():
	h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location")
	h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location")

	env_btn = gr.Button("🗺️ Show Thief Distribution", variant="primary")
	env_plot = gr.Plot()
	# Slider values are passed positionally as (hotspot1, hotspot2).
	env_btn.click(show_environment, [h1_slider, h2_slider], env_plot)

	# ==== TAB 3: One Episode ====
	with gr.TabItem("3️⃣ Watch One Episode"):
	gr.Markdown("""
	## 👀 See One Month (30 Days) of Simulation

	Watch how AI makes decisions and catches thieves.

	(Note: This is untrained AI with 50% exploration rate)
	""")

	with gr.Row():
	h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
	h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")

	ep_btn = gr.Button("▶️ Run One Episode", variant="primary")
	ep_plot = gr.Plot()
	ep_summary = gr.Markdown()
	# simulate_one_episode returns (figure, markdown), matching the two outputs.
	ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary])

	# ==== TAB 4: Full Training ====
	with gr.TabItem("4️⃣ Train the AI!"):
	gr.Markdown("""
	## 🏋️ Train Q-Learning AI

	Train the AI and compare it against other strategies!

	⚠️ Training takes a few seconds depending on episodes.
	""")

	with gr.Row():
	episodes_slider = gr.Slider(100, 1000, value=300, step=50,
	label="Number of Episodes")

	with gr.Row():
	h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
	h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")

	train_btn = gr.Button("🚀 Train AI!", variant="primary", size="lg")

	train_plot = gr.Plot()
	train_results = gr.Markdown()

	# Inputs map positionally to (n_episodes, hotspot1, hotspot2); the two
	# outputs receive the (figure, markdown) pair returned by train_and_test.
	train_btn.click(train_and_test,
	[episodes_slider, h1_train, h2_train],
	[train_plot, train_results])

	# ==== TAB 5: Summary ====
	# Static reference text only; this tab has no interactive components.
	with gr.TabItem("5️⃣ Key Concepts"):
	gr.Markdown("""
	## 📚 Summary: Q-Learning Key Concepts

	### 1. Q-Table
	```
	A "cheat sheet" that stores:
	"In STATE X, if I do ACTION Y, I expect REWARD Z"
	```

	### 2. State
	```
	What the AI "sees" at any moment.
	Example: (most_tried_zone, best_zone_so_far)
	```

	### 3. Action
	```
	What the AI can do.
	Example: Place sensors in zones (2, 3, 7, 8)
	```

	### 4. Reward
	```
	Points for good actions.
	Example: +1 for each thief caught
	```

	### 5. Epsilon (ε)
	```
	Exploration rate.
	ε = 1.0 → 100% random (exploring)
	ε = 0.01 → 1% random (exploiting knowledge)
	```

	### 6. Learning Formula
	```
	Q(s,a) = Q(s,a) + α × (reward + γ × max(Q(s',a')) - Q(s,a))

	In simple terms:
	New Memory = Old Memory + Learning Rate × (Reality - Expectation)
	```

	---

	## 🎯 Why This Matters

	This same technique is used in:
	- 🎮 Game AI (AlphaGo, Chess engines)
	- 🚗 Self-driving cars
	- 🤖 Robots
	- 📱 Recommendation systems

	You just learned how real AI works! 🎓
	""")

	# Footer shown below the tabs.
	gr.Markdown("""
	---

	### 🔗 About

	This demo shows Q-Learning Reinforcement Learning for sensor placement.

	The AI learns through trial-and-error, just like humans!
	""")


	# Launch the Gradio app only when this file is run as a script, so that
	# importing the module does not start a server.
	if __name__ == "__main__":
	demo.launch()