# Provenance (from the Hugging Face file view): uploaded by user "Naz786",
# commit 9c74b9c (verified), commit message "Create app.py".
"""
Q-Learning AI for Sensor Placement - Interactive Demo
For Hugging Face Spaces
"""
import numpy as np
import matplotlib.pyplot as plt
import gradio as gr
from collections import defaultdict
np.random.seed(42)
# ==============================================================================
# PART 1: THE SECRET WORLD
# ==============================================================================
class ThiefWorld:
    """Ground-truth thief distribution — hidden from the agent, which must discover it."""

    def __init__(self, hotspot1=2.5, hotspot2=7.0):
        self.hotspot1 = hotspot1
        self.hotspot2 = hotspot2
        self.n_zones = 10

    def get_thief_probability(self, zone):
        """Probability of a thief appearing in `zone`.

        The distribution is two Gaussian bumps centred on the hotspots
        (weights 0.6 and 0.4, widths 1.0 and 0.8) plus a 5% noise floor,
        capped at 1.0.
        """
        center = zone + 0.5
        bump_main = 0.6 * np.exp(-((center - self.hotspot1) ** 2) / 1.0)
        bump_side = 0.4 * np.exp(-((center - self.hotspot2) ** 2) / 0.8)
        return min(bump_main + bump_side + 0.05, 1.0)

    def generate_thieves(self):
        """Sample one day's thief occupancy as a 0/1 array over the zones.

        Draws one Bernoulli sample per zone, in zone order (the draw order
        matters for reproducibility under a fixed numpy seed).
        """
        presence = np.zeros(self.n_zones)
        for z in range(self.n_zones):
            if np.random.random() < self.get_thief_probability(z):
                presence[z] = 1
        return presence
# ==============================================================================
# PART 2: SENSOR
# ==============================================================================
class Sensor:
    """An imperfect detector: catches a present thief with fixed probability."""

    def __init__(self, catch_probability=0.9):
        self.catch_prob = catch_probability

    def try_catch(self, thief_present):
        """Return truthy iff a thief is present AND the detector fires.

        The random draw is only consumed when a thief is present, so RNG
        state stays in sync with the original behaviour under a fixed seed.
        """
        return bool(thief_present) and np.random.random() < self.catch_prob
# ==============================================================================
# PART 3: ENVIRONMENT
# ==============================================================================
class SensorPlacementEnv:
    """RL environment: place `n_sensors` sensors per day for a 30-day episode.

    Reward per step = thieves caught + a small diversity bonus. The true
    hotspot locations live inside the embedded ThiefWorld; the agent only
    observes aggregate catch statistics via the state tuple.
    """

    def __init__(self, n_sensors=4, hotspot1=2.5, hotspot2=7.0):
        self.world = ThiefWorld(hotspot1, hotspot2)
        self.sensor = Sensor()
        self.n_sensors = n_sensors
        self.n_zones = 10
        self.reset()

    def reset(self):
        """Begin a fresh 30-day episode and return the initial state."""
        self.zone_attempts = np.zeros(self.n_zones)  # sensor placements per zone
        self.zone_catches = np.zeros(self.n_zones)   # successful catches per zone
        self.day = 0
        self.total_caught = 0
        self.total_thieves = 0
        return self._get_state()

    def _get_state(self):
        """Compact state: (most-tried zone, zone with highest observed catch rate).

        Before any placement the state is (0, 0). Ties resolve to the
        lowest-index zone via np.argmax.
        """
        if self.zone_attempts.sum() == 0:
            return (0, 0)
        most_tried = int(np.argmax(self.zone_attempts))
        catch_rates = np.zeros(self.n_zones)
        for z in range(self.n_zones):
            if self.zone_attempts[z] > 0:
                catch_rates[z] = self.zone_catches[z] / self.zone_attempts[z]
        best_zone = int(np.argmax(catch_rates))
        return (most_tried, best_zone)

    def step(self, action):
        """Simulate one day with sensors placed at `action` (tuple of zone indices).

        Returns (next_state, reward, done, info) in the classic gym style,
        where info carries the day's catch count.
        """
        thieves = self.world.generate_thieves()
        n_thieves = int(thieves.sum())
        self.total_thieves += n_thieves
        caught = 0
        for zone in action:
            # Out-of-range zones are silently ignored (sensor wasted).
            if zone < self.n_zones:
                self.zone_attempts[zone] += 1
                if thieves[zone] == 1:
                    if self.sensor.try_catch(True):
                        caught += 1
                        self.zone_catches[zone] += 1
        self.total_caught += caught
        self.day += 1
        # Diversity bonus: 0.1 per *distinct* zone discourages stacking sensors.
        reward = caught + 0.1 * len(set(action))
        done = self.day >= 30
        return self._get_state(), reward, done, {'caught': caught}
# ==============================================================================
# PART 4: Q-LEARNING AGENT
# ==============================================================================
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy over a fixed action menu."""

    def __init__(self):
        # q_table[state][action] -> expected return; unseen pairs default to 0.0.
        self.q_table = defaultdict(lambda: defaultdict(float))
        self.learning_rate = 0.1      # alpha: step size of each Bellman update
        self.discount_factor = 0.95   # gamma: weight of future rewards
        self.epsilon = 1.0            # exploration probability, annealed per episode
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def _get_possible_actions(self):
        """Fixed menu of candidate 4-sensor placements (tuples of zone indices)."""
        return [
            (1, 3, 6, 8), (0, 3, 6, 9), (2, 4, 6, 8),
            (0, 1, 2, 3), (1, 2, 3, 4), (2, 3, 4, 5),
            (5, 6, 7, 8), (6, 7, 8, 9), (4, 5, 6, 7),
            (2, 3, 7, 8), (1, 2, 6, 7), (2, 3, 6, 7),
            (3, 4, 5, 6), (0, 2, 5, 9), (1, 4, 7, 9),
        ]

    def choose_action(self, state):
        """Epsilon-greedy selection: random action with prob epsilon, else best known."""
        candidates = self._get_possible_actions()
        if np.random.random() < self.epsilon:
            return candidates[np.random.randint(len(candidates))]
        # Greedy pick; max() returns the earliest maximal candidate, matching
        # the first-strictly-greater scan of a naive loop.
        return max(candidates, key=lambda a: self.q_table[state][a])

    def learn(self, state, action, reward, next_state, done):
        """One Bellman backup: nudge Q(s,a) toward reward + gamma * max_a' Q(s',a')."""
        current = self.q_table[state][action]
        if done:
            best_future = 0
        else:
            best_future = max(self.q_table[next_state][a]
                              for a in self._get_possible_actions())
        td_target = reward + self.discount_factor * best_future
        self.q_table[state][action] = current + self.learning_rate * (td_target - current)

    def decay_epsilon(self):
        """Anneal exploration after each episode, never dropping below epsilon_min."""
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
# ==============================================================================
# TRAINING AND TESTING FUNCTIONS
# ==============================================================================
def train_and_test(n_episodes, hotspot1, hotspot2, progress=gr.Progress()):
    """Train AI and compare with other strategies.

    Trains a Q-learning agent for `n_episodes` 30-day episodes, then runs
    50 greedy test episodes each for: the trained agent, random placement,
    a fixed uniform placement, and a "cheating" placement on the true
    hotspot zones. Returns (matplotlib figure with 4 panels, markdown
    results string) for the Gradio UI.
    """
    np.random.seed(42)  # reproducible training/testing runs
    # Training
    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()
    episode_rewards = []
    episode_catch_rates = []
    epsilon_history = []
    for episode in progress.tqdm(range(n_episodes), desc="Training AI"):
        state = env.reset()
        total_reward = 0
        for day in range(30):
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.learn(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if done:
                break
        agent.decay_epsilon()
        episode_rewards.append(total_reward)
        # Episode score = % of all thieves caught (guard against division by 0).
        catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
        episode_catch_rates.append(catch_rate)
        epsilon_history.append(agent.epsilon)
    # Testing
    n_tests = 50
    results = {}
    # Q-Learning AI (epsilon = 0 -> purely greedy, no exploration during tests)
    agent.epsilon = 0
    catches = []
    for _ in range(n_tests):
        state = env.reset()
        for day in range(30):
            action = agent.choose_action(state)
            state, _, done, _ = env.step(action)
            if done:
                break
        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
    results['Q-Learning AI'] = np.mean(catches)
    # Random baseline: 4 distinct random zones every day
    catches = []
    for _ in range(n_tests):
        env.reset()
        for day in range(30):
            action = tuple(np.random.choice(10, 4, replace=False))
            _, _, done, _ = env.step(action)
            if done:
                break
        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
    results['Random'] = np.mean(catches)
    # Static baseline: the same evenly-spread placement every day
    catches = []
    for _ in range(n_tests):
        env.reset()
        for day in range(30):
            _, _, done, _ = env.step((1, 3, 6, 8))
            if done:
                break
        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
    results['Static Uniform'] = np.mean(catches)
    # Perfect baseline: "cheats" by covering the true hotspot zones directly
    h1_zone = int(hotspot1)
    h2_zone = int(hotspot2)
    perfect_action = (h1_zone, h1_zone+1, h2_zone, h2_zone+1)
    perfect_action = tuple(min(z, 9) for z in perfect_action)  # clamp to valid zones
    catches = []
    for _ in range(n_tests):
        env.reset()
        for day in range(30):
            _, _, done, _ = env.step(perfect_action)
            if done:
                break
        catches.append(env.total_caught / max(env.total_thieves, 1) * 100)
    results['Perfect (Cheating)'] = np.mean(catches)
    # Create plots
    fig = plt.figure(figsize=(16, 12))
    # Plot 1: Learning curve (raw per-episode rate + moving-average smoothing)
    ax1 = fig.add_subplot(2, 2, 1)
    window = max(10, n_episodes // 20)
    if len(episode_catch_rates) >= window:
        smoothed = np.convolve(episode_catch_rates, np.ones(window)/window, mode='valid')
        ax1.plot(episode_catch_rates, alpha=0.3, color='green', label='Raw')
        ax1.plot(range(window-1, len(episode_catch_rates)), smoothed,
                 color='green', linewidth=2, label='Smoothed')
    else:
        ax1.plot(episode_catch_rates, color='green', linewidth=2)
    ax1.set_xlabel('Episode', fontsize=12)
    ax1.set_ylabel('Catch Rate (%)', fontsize=12)
    ax1.set_title('๐ŸŽ“ AI Learning Progress', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    # Plot 2: Epsilon decay over the training run
    ax2 = fig.add_subplot(2, 2, 2)
    ax2.plot(epsilon_history, color='purple', linewidth=2)
    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Epsilon (Exploration Rate)', fontsize=12)
    ax2.set_title('๐Ÿ” Explore vs Exploit Balance', fontsize=14)
    ax2.grid(True, alpha=0.3)
    # Add annotations
    ax2.annotate('100% Random\n(Exploring)', xy=(0, 1), fontsize=10,
                 xytext=(n_episodes*0.1, 0.8), arrowprops=dict(arrowstyle='->', color='gray'))
    ax2.annotate('Mostly Using\nKnowledge', xy=(n_episodes-1, epsilon_history[-1]), fontsize=10,
                 xytext=(n_episodes*0.7, 0.3), arrowprops=dict(arrowstyle='->', color='gray'))
    # Plot 3: What AI learned vs Truth — average Q-value contribution per zone,
    # normalized, against the true thief probability per zone.
    ax3 = fig.add_subplot(2, 2, 3)
    zone_values = np.zeros(10)
    zone_counts = np.zeros(10)
    for state, actions in agent.q_table.items():
        for action, value in actions.items():
            for zone in action:
                zone_values[zone] += value
                zone_counts[zone] += 1
    zone_counts[zone_counts == 0] = 1  # avoid division by zero for unvisited zones
    learned = zone_values / zone_counts
    world = ThiefWorld(hotspot1, hotspot2)
    truth = [world.get_thief_probability(z) for z in range(10)]
    x = np.arange(10)
    width = 0.35
    ax3.bar(x - width/2, learned / max(learned.max(), 0.01), width,
            label='AI Learned', color='blue', alpha=0.7)
    ax3.bar(x + width/2, np.array(truth) / max(truth), width,
            label='True Probability', color='red', alpha=0.7)
    ax3.axvline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1 ({hotspot1})')
    ax3.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2 ({hotspot2})')
    ax3.set_xlabel('Zone', fontsize=12)
    ax3.set_ylabel('Normalized Value', fontsize=12)
    ax3.set_title('๐Ÿง  Did AI Learn the Truth?', fontsize=14)
    ax3.legend(loc='upper right')
    ax3.grid(True, alpha=0.3)
    ax3.set_xticks(range(10))
    # Plot 4: Final comparison of the four strategies
    ax4 = fig.add_subplot(2, 2, 4)
    names = list(results.keys())
    values = list(results.values())
    colors = ['green', 'gray', 'orange', 'blue']
    bars = ax4.bar(names, values, color=colors, alpha=0.7, edgecolor='black')
    for bar, val in zip(bars, values):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                 f'{val:.1f}%', ha='center', fontsize=12, fontweight='bold')
    ax4.set_ylabel('Catch Rate (%)', fontsize=12)
    ax4.set_title('๐Ÿ† Final Comparison', fontsize=14)
    ax4.grid(True, alpha=0.3, axis='y')
    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=15, ha='right')
    plt.tight_layout()
    # Results text (markdown rendered by the Gradio UI)
    results_text = f"""
## ๐ŸŽฏ Training Complete!
### Training Summary:
- Episodes trained: **{n_episodes}**
- Hotspot 1: Zone **{hotspot1}**
- Hotspot 2: Zone **{hotspot2}**
- Final exploration rate: **{epsilon_history[-1]*100:.1f}%**
### ๐Ÿ“Š Test Results (50 test runs each):
| Strategy | Catch Rate |
|----------|------------|
| ๐Ÿ† **Q-Learning AI** | **{results['Q-Learning AI']:.1f}%** |
| Random | {results['Random']:.1f}% |
| Static Uniform | {results['Static Uniform']:.1f}% |
| Perfect (Cheating) | {results['Perfect (Cheating)']:.1f}% |
### ๐Ÿง  What AI Learned:
The AI discovered that zones **{int(hotspot1)}** and **{int(hotspot2)}** have more thieves!
### ๐ŸŽ“ Key Insight:
AI started knowing **NOTHING** and learned through **trial and error**!
"""
    return fig, results_text
def explain_qlearning():
    """Create explanation visualization.

    Left panel: ASCII diagram of the Q-learning state/action/reward cycle.
    Right panel: how the epsilon-greedy explore/exploit split evolves over
    500 episodes using the same decay schedule as QLearningAgent.
    Returns the matplotlib figure for the Gradio Plot component.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Plot 1: Q-Learning cycle
    ax1 = axes[0]
    ax1.axis('off')
    # Draw cycle
    cycle_text = """
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Q-LEARNING CYCLE โ”‚
โ”‚ โ”‚
โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚
โ”‚ โ”‚ STATE โ”‚ โ”‚
โ”‚ โ”‚(What AI โ”‚ โ”‚
โ”‚ โ”‚ sees) โ”‚ โ”‚
โ”‚ โ””โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”˜ โ”‚
โ”‚ โ”‚ โ”‚
โ”‚ โ–ผ โ”‚
โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚
โ”‚ โ”‚ UPDATE โ”‚โ—„โ”€โ”€โ”€โ”€โ”‚ ACTION โ”‚โ”€โ”€โ”€โ”€โ–บโ”‚ REWARD โ”‚ โ”‚
โ”‚ โ”‚ Q-TABLE โ”‚ โ”‚(Place โ”‚ โ”‚(Caught โ”‚ โ”‚
โ”‚ โ”‚(Remember)โ”‚ โ”‚sensors) โ”‚ โ”‚thieves?) โ”‚ โ”‚
โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚
โ”‚ โ”‚ โ”‚ โ”‚
โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚
โ”‚ REPEAT! โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
    """
    ax1.text(0.5, 0.5, cycle_text, transform=ax1.transAxes, fontsize=10,
             verticalalignment='center', horizontalalignment='center',
             fontfamily='monospace', bbox=dict(boxstyle='round', facecolor='lightyellow'))
    ax1.set_title('How Q-Learning Works', fontsize=14)
    # Plot 2: Epsilon explanation
    ax2 = axes[1]
    episodes = np.arange(500)
    epsilon = 1.0 * (0.995 ** episodes)  # same decay schedule as QLearningAgent
    epsilon = np.maximum(epsilon, 0.01)  # same floor as epsilon_min
    ax2.fill_between(episodes, epsilon, alpha=0.3, color='blue', label='EXPLORE')
    ax2.fill_between(episodes, 0, 1-epsilon, alpha=0.3, color='green', label='EXPLOIT')
    ax2.plot(episodes, epsilon, 'b-', linewidth=2)
    ax2.plot(episodes, 1-epsilon, 'g-', linewidth=2)
    # Vertical guides marking early / middle / late training phases.
    ax2.axvline(50, color='gray', linestyle='--', alpha=0.5)
    ax2.axvline(200, color='gray', linestyle='--', alpha=0.5)
    ax2.axvline(400, color='gray', linestyle='--', alpha=0.5)
    ax2.text(25, 0.5, 'Early:\n80% Explore', fontsize=9, ha='center')
    ax2.text(125, 0.5, 'Middle:\n50-50', fontsize=9, ha='center')
    ax2.text(300, 0.5, 'Late:\n80% Exploit', fontsize=9, ha='center')
    ax2.set_xlabel('Episode', fontsize=12)
    ax2.set_ylabel('Probability', fontsize=12)
    ax2.set_title('Explore vs Exploit Over Time', fontsize=14)
    ax2.legend(loc='center right')
    ax2.grid(True, alpha=0.3)
    plt.tight_layout()
    return fig
def show_environment(hotspot1, hotspot2):
    """Plot the hidden thief-probability distribution the AI must discover.

    Bars are traffic-light coded by risk level; dashed lines mark the
    (secret) hotspot locations. Returns the matplotlib figure.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    world = ThiefWorld(hotspot1, hotspot2)
    zones = np.arange(10)
    probs = [world.get_thief_probability(z) for z in zones]

    def risk_color(p):
        # High / medium / low thief probability thresholds.
        if p > 0.4:
            return 'red'
        if p > 0.2:
            return 'orange'
        return 'green'

    bar_colors = [risk_color(p) for p in probs]
    bars = ax.bar(zones, probs, color=bar_colors, alpha=0.7, edgecolor='black')
    # Percentage label above each bar.
    for rect, p in zip(bars, probs):
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 0.02,
                f'{p*100:.0f}%', ha='center', fontsize=10, fontweight='bold')
    ax.axvline(hotspot1, color='red', linestyle='--', linewidth=2, label=f'Hotspot 1 ({hotspot1})')
    ax.axvline(hotspot2, color='darkred', linestyle='--', linewidth=2, label=f'Hotspot 2 ({hotspot2})')
    ax.set_xlabel('Zone', fontsize=12)
    ax.set_ylabel('Thief Probability', fontsize=12)
    ax.set_title('๐Ÿฆน Secret Thief Locations (AI Must Discover This!)', fontsize=14)
    ax.set_xticks(zones)
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    return fig
def simulate_one_episode(hotspot1, hotspot2):
    """Simulate and visualize one 30-day episode with a fresh agent.

    Args:
        hotspot1: True location of the first thief hotspot (zone coordinate).
        hotspot2: True location of the second thief hotspot.

    Returns:
        (matplotlib figure with 4 diagnostic panels, markdown summary string).
    """
    np.random.seed(None)  # Random seed for variety
    env = SensorPlacementEnv(hotspot1=hotspot1, hotspot2=hotspot2)
    agent = QLearningAgent()
    agent.epsilon = 0.5  # 50% explore for demo
    state = env.reset()
    # Track daily data
    daily_actions = []
    daily_caught = []
    daily_thieves = []
    for day in range(30):
        action = agent.choose_action(state)
        daily_actions.append(action)
        old_caught = env.total_caught
        old_thieves = env.total_thieves
        # BUG FIX: remember the state the action was chosen in; the original
        # overwrote `state` with the post-step state and then called
        # learn(state, ..., state, ...), updating Q for the wrong state.
        prev_state = state
        state, reward, done, info = env.step(action)
        daily_caught.append(env.total_caught - old_caught)
        daily_thieves.append(env.total_thieves - old_thieves)
        agent.learn(prev_state, action, reward, state, done)
    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    # Plot 1: Sensor placements over days
    ax1 = axes[0, 0]
    for day, action in enumerate(daily_actions):
        for zone in action:
            ax1.scatter(day, zone, c='blue', s=30, alpha=0.6)
    ax1.axhline(hotspot1, color='red', linestyle='--', alpha=0.5, label=f'Hotspot 1')
    ax1.axhline(hotspot2, color='darkred', linestyle='--', alpha=0.5, label=f'Hotspot 2')
    ax1.set_xlabel('Day', fontsize=12)
    ax1.set_ylabel('Zone', fontsize=12)
    ax1.set_title('๐Ÿ“ Where AI Placed Sensors Each Day', fontsize=14)
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    ax1.set_yticks(range(10))
    # Plot 2: Daily catches vs thieves present
    ax2 = axes[0, 1]
    days = range(1, 31)
    ax2.bar(days, daily_caught, color='green', alpha=0.7, label='Caught')
    ax2.plot(days, daily_thieves, 'ro-', markersize=5, label='Total Thieves')
    ax2.set_xlabel('Day', fontsize=12)
    ax2.set_ylabel('Count', fontsize=12)
    ax2.set_title('๐ŸŽฏ Daily Catches', fontsize=14)
    ax2.legend()
    ax2.grid(True, alpha=0.3)
    # Plot 3: Cumulative performance over the month
    ax3 = axes[1, 0]
    cum_caught = np.cumsum(daily_caught)
    cum_thieves = np.cumsum(daily_thieves)
    ax3.fill_between(days, cum_caught, alpha=0.3, color='green')
    ax3.plot(days, cum_caught, 'g-', linewidth=2, label='Cumulative Caught')
    ax3.plot(days, cum_thieves, 'r--', linewidth=2, label='Cumulative Thieves')
    ax3.set_xlabel('Day', fontsize=12)
    ax3.set_ylabel('Cumulative Count', fontsize=12)
    ax3.set_title('๐Ÿ“ˆ Cumulative Performance', fontsize=14)
    ax3.legend()
    ax3.grid(True, alpha=0.3)
    # Plot 4: Zone usage (blue highlights zones adjacent to the true hotspots)
    ax4 = axes[1, 1]
    zone_usage = np.zeros(10)
    for action in daily_actions:
        for zone in action:
            zone_usage[zone] += 1
    colors = ['blue' if z in [int(hotspot1), int(hotspot1)+1, int(hotspot2), int(hotspot2)+1]
              else 'gray' for z in range(10)]
    ax4.bar(range(10), zone_usage, color=colors, alpha=0.7, edgecolor='black')
    ax4.axvline(hotspot1, color='red', linestyle='--', alpha=0.5)
    ax4.axvline(hotspot2, color='darkred', linestyle='--', alpha=0.5)
    ax4.set_xlabel('Zone', fontsize=12)
    ax4.set_ylabel('Times Used', fontsize=12)
    ax4.set_title('๐Ÿ—บ๏ธ Zone Usage (Blue = Near Hotspots)', fontsize=14)
    ax4.set_xticks(range(10))
    ax4.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    # Markdown summary for the Gradio UI
    catch_rate = env.total_caught / max(env.total_thieves, 1) * 100
    summary = f"""
## ๐Ÿ“Š Episode Summary
- **Total Thieves:** {env.total_thieves}
- **Total Caught:** {env.total_caught}
- **Catch Rate:** {catch_rate:.1f}%
### Zones Most Used:
{', '.join([f'Zone {i}' for i in np.argsort(zone_usage)[-3:][::-1]])}
### Note:
This is just ONE episode with 50% exploration.
Train for 500+ episodes to see real learning!
"""
    return fig, summary
# ==============================================================================
# GRADIO INTERFACE
# ==============================================================================
# Five-tab Gradio UI: concept explainer, environment viewer, single-episode
# simulator, full training run, and a written key-concepts summary.
with gr.Blocks(title="Q-Learning AI Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# ๐Ÿค– Q-Learning AI for Sensor Placement
**Watch an AI learn where to place sensors to catch thieves!**
The AI starts knowing NOTHING and learns through trial-and-error.
---
""")
    with gr.Tabs():
        # ==== TAB 1: Explanation ====
        with gr.TabItem("1๏ธโƒฃ What is Q-Learning?"):
            gr.Markdown("""
## ๐ŸŽ“ Q-Learning Explained Simply
### Like Teaching a Dog:
```
1. Dog tries something โ†’ 2. Gets treat (or not) โ†’ 3. Remembers โ†’ 4. Gets smarter!
```
### For Our AI:
```
1. AI places sensors โ†’ 2. Catches thieves (reward!) โ†’ 3. Updates Q-Table โ†’ 4. Gets smarter!
```
### The Q-Table (AI's Memory):
| State | Action | Expected Reward |
|-------|--------|-----------------|
| "Day 1" | Zones (1,3,6,8) | 1.5 points |
| "Day 1" | Zones (2,3,7,8) | 3.2 points โ† Better! |
### Explore vs Exploit:
- **EXPLORE**: Try random things to learn
- **EXPLOIT**: Use what you already know
Early training โ†’ More EXPLORE
Late training โ†’ More EXPLOIT
""")
            explain_btn = gr.Button("๐Ÿ“Š Show Visual Explanation", variant="primary")
            explain_plot = gr.Plot()
            # Static figure — no inputs needed.
            explain_btn.click(explain_qlearning, outputs=explain_plot)
        # ==== TAB 2: Environment ====
        with gr.TabItem("2๏ธโƒฃ The Secret World"):
            gr.Markdown("""
## ๐Ÿฆน Where Do Thieves Appear?
The AI doesn't know this! It must DISCOVER it through learning.
Adjust the hotspot locations and see the thief distribution:
""")
            with gr.Row():
                h1_slider = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1 Location")
                h2_slider = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2 Location")
            env_btn = gr.Button("๐Ÿ—บ๏ธ Show Thief Distribution", variant="primary")
            env_plot = gr.Plot()
            env_btn.click(show_environment, [h1_slider, h2_slider], env_plot)
        # ==== TAB 3: One Episode ====
        with gr.TabItem("3๏ธโƒฃ Watch One Episode"):
            gr.Markdown("""
## ๐Ÿ‘€ See One Month (30 Days) of Simulation
Watch how AI makes decisions and catches thieves.
(Note: This is untrained AI with 50% exploration rate)
""")
            with gr.Row():
                h1_ep = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_ep = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            ep_btn = gr.Button("โ–ถ๏ธ Run One Episode", variant="primary")
            ep_plot = gr.Plot()
            ep_summary = gr.Markdown()
            ep_btn.click(simulate_one_episode, [h1_ep, h2_ep], [ep_plot, ep_summary])
        # ==== TAB 4: Full Training ====
        with gr.TabItem("4๏ธโƒฃ Train the AI!"):
            gr.Markdown("""
## ๐Ÿ‹๏ธ Train Q-Learning AI
Train the AI and compare it against other strategies!
โš ๏ธ Training takes a few seconds depending on episodes.
""")
            with gr.Row():
                episodes_slider = gr.Slider(100, 1000, value=300, step=50,
                                            label="Number of Episodes")
            with gr.Row():
                h1_train = gr.Slider(0, 9, value=2.5, step=0.5, label="Hotspot 1")
                h2_train = gr.Slider(0, 9, value=7.0, step=0.5, label="Hotspot 2")
            train_btn = gr.Button("๐Ÿš€ Train AI!", variant="primary", size="lg")
            train_plot = gr.Plot()
            train_results = gr.Markdown()
            train_btn.click(train_and_test,
                            [episodes_slider, h1_train, h2_train],
                            [train_plot, train_results])
        # ==== TAB 5: Summary ====
        with gr.TabItem("5๏ธโƒฃ Key Concepts"):
            gr.Markdown("""
## ๐Ÿ“š Summary: Q-Learning Key Concepts
### 1. Q-Table
```
A "cheat sheet" that stores:
"In STATE X, if I do ACTION Y, I expect REWARD Z"
```
### 2. State
```
What the AI "sees" at any moment.
Example: (most_tried_zone, best_zone_so_far)
```
### 3. Action
```
What the AI can do.
Example: Place sensors in zones (2, 3, 7, 8)
```
### 4. Reward
```
Points for good actions.
Example: +1 for each thief caught
```
### 5. Epsilon (ฮต)
```
Exploration rate.
ฮต = 1.0 โ†’ 100% random (exploring)
ฮต = 0.01 โ†’ 1% random (exploiting knowledge)
```
### 6. Learning Formula
```
Q(s,a) = Q(s,a) + ฮฑ ร— (reward + ฮณ ร— max(Q(s',a')) - Q(s,a))
In simple terms:
New Memory = Old Memory + Learning Rate ร— (Reality - Expectation)
```
---
## ๐ŸŽฏ Why This Matters
This same technique is used in:
- ๐ŸŽฎ Game AI (AlphaGo, Chess engines)
- ๐Ÿš— Self-driving cars
- ๐Ÿค– Robots
- ๐Ÿ“ฑ Recommendation systems
**You just learned how real AI works!** ๐ŸŽ“
""")
    # Footer shown below the tabs.
    gr.Markdown("""
---
### ๐Ÿ”— About
This demo shows **Q-Learning Reinforcement Learning** for sensor placement.
The AI learns through trial-and-error, just like humans!
""")
# Launch
if __name__ == "__main__":
    demo.launch()