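"""Evaluate a trained A2C PandaReachDense-v3 agent and publish it to the Hugging Face Hub.

The script picks a saved model together with its VecNormalize statistics,
evaluates the policy, records a demo video, writes a model card (README.md)
and uploads the resulting folder to the Hub.
"""
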
import os
import shutil
import sys

import gymnasium as gym
import numpy as np
import panda_gym
from huggingface_hub import HfApi, create_repo
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecNormalize, VecVideoRecorder

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
USERNAME = "ImaghT"  # Hub account that owns the model repository
MODEL_NAME = "a2c-PandaReachDense-v3"  # repository name and stem of the uploaded archive
ENV_ID = "PandaReachDense-v3"  # environment id handed to make_vec_env
N_EVAL_EPISODES = 20  # number of episodes used by evaluate_policy

# "<user>/<model>" identifier used when creating the repo and uploading to it.
repo_id = f"{USERNAME}/{MODEL_NAME}"

# ---------------------------------------------------------------------------
# Pick the model artefacts to publish
# ---------------------------------------------------------------------------
print("=" * 60)
print("🔍 检测可用模型文件...")
print("=" * 60)

# Model paths are stored WITHOUT the ".zip" suffix; the suffix is appended
# explicitly wherever the file itself is checked or copied.
BEST_MODEL_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model"
BEST_VEC_NORMALIZE_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model_vecnormalize.pkl"
FINAL_MODEL_PATH = "a2c-PandaReachDense-v3"
FINAL_VEC_NORMALIZE_PATH = "vec_normalize.pkl"

# A candidate is usable only when BOTH the model archive and its matching
# normalization statistics exist. The best-model pair is preferred.
if os.path.exists(f"{BEST_MODEL_PATH}.zip") and os.path.exists(BEST_VEC_NORMALIZE_PATH):
    print("✅ 发现训练期间保存的最佳模型(推荐使用)")
    MODEL_PATH = BEST_MODEL_PATH
    VEC_NORMALIZE_PATH = BEST_VEC_NORMALIZE_PATH
    model_source = "best_model"
elif os.path.exists(f"{FINAL_MODEL_PATH}.zip") and os.path.exists(FINAL_VEC_NORMALIZE_PATH):
    print("✅ 发现最终训练模型")
    MODEL_PATH = FINAL_MODEL_PATH
    VEC_NORMALIZE_PATH = FINAL_VEC_NORMALIZE_PATH
    model_source = "final_model"
else:
    print("❌ 错误: 未找到可用的模型文件!")
    print("\n请确保以下文件之一存在:")
    print(f" 方案1: {BEST_MODEL_PATH}.zip + {BEST_VEC_NORMALIZE_PATH}")
    print(f" 方案2: {FINAL_MODEL_PATH}.zip + {FINAL_VEC_NORMALIZE_PATH}")
    print("\n请先运行 Unit 6.py 训练代码。")
    # sys.exit rather than the builtin exit(): the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    sys.exit(1)

print(f"📁 使用模型: {MODEL_PATH}")
print(f"📁 使用归一化: {VEC_NORMALIZE_PATH}")
print(f"📊 模型来源: {model_source}\n")

# ---------------------------------------------------------------------------
# Load the evaluation environment and the model
# ---------------------------------------------------------------------------
print("加载模型...")

# Single-env vectorized environment wrapped with the saved normalization stats.
eval_env = make_vec_env(ENV_ID, n_envs=1)
eval_env = VecNormalize.load(VEC_NORMALIZE_PATH, eval_env)
# Test-time settings: turn off training mode and reward normalization so the
# evaluation reports un-normalized rewards.
eval_env.training = False
eval_env.norm_reward = False

model = A2C.load(MODEL_PATH, env=eval_env)
print("✅ 模型加载成功\n")

# ---------------------------------------------------------------------------
# Evaluate the policy
# ---------------------------------------------------------------------------
print("=" * 60)
print(f"🧪 开始评估 ({N_EVAL_EPISODES} episodes)...")
print("=" * 60)

mean_reward, std_reward = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=N_EVAL_EPISODES,
    deterministic=True,
)

# The reported score is mean reward minus standard deviation; it has to reach
# the baseline below for the run to count as passed.
score = mean_reward - std_reward
PASS_THRESHOLD = -3.5
passed = score >= PASS_THRESHOLD
status_emoji = "✅" if passed else "❌"

print("\n" + "=" * 60)
print("📊 评估结果:")
print(f" Mean Reward: {mean_reward:.2f}")
print(f" Std Reward: {std_reward:.2f}")
print(f" Score (mean-std): {score:.2f}")
print(f" 通过基准线: {PASS_THRESHOLD}")
if passed:
    print(" ✅ 状态: PASSED")
else:
    # Show how many points are still missing to reach the baseline.
    print(f" ❌ 状态: NOT PASSED (还差 {PASS_THRESHOLD - score:.2f} 分)")
print("=" * 60 + "\n")

# ---------------------------------------------------------------------------
# Record a demo video
# ---------------------------------------------------------------------------
print("🎬 生成演示视频...")
video_folder = "/home/eason/Workspace/RL/Unit_6/video_upload"
os.makedirs(video_folder, exist_ok=True)

# One constant drives both the recorder length and the rollout length so the
# two cannot drift apart.
VIDEO_LENGTH = 500

# A separate environment is built for recording; it uses the same saved
# normalization statistics and test-time flags as the evaluation environment.
video_env = make_vec_env(ENV_ID, n_envs=1)
video_env = VecNormalize.load(VEC_NORMALIZE_PATH, video_env)
video_env.training = False
video_env.norm_reward = False

video_env = VecVideoRecorder(
    video_env,
    video_folder,
    record_video_trigger=lambda step: step == 0,  # start recording at step 0 only
    video_length=VIDEO_LENGTH,
    name_prefix="panda-reach-agent",
)

try:
    obs = video_env.reset()
    for _ in range(VIDEO_LENGTH):
        action, _states = model.predict(obs, deterministic=True)
        obs, _, _, _ = video_env.step(action)
finally:
    # Always close the recorder, even if the rollout fails part-way, so the
    # environment is not left open.
    video_env.close()
print("✅ 视频已生成\n")

# ---------------------------------------------------------------------------
# Summarize the training logs for the model card
# ---------------------------------------------------------------------------
EVALUATIONS_PATH = "/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"


def build_training_info(timesteps, results, model_source):
    """Render the "Training Monitoring" markdown section of the model card.

    Args:
        timesteps: Sequence of timesteps at which evaluations were logged.
        results: 2-D sequence of rewards, one row per logged evaluation.
        model_source: "best_model" or any other value (treated as the final model).

    Returns:
        The markdown section as a string. Values that cannot be computed
        because the logs are empty are rendered as "Unknown".
    """
    total_evals = len(timesteps)
    # Format each value BEFORE substituting it: applying ":," or ":.2f" to the
    # "Unknown" placeholder string would raise ValueError.
    final_step = f"{int(timesteps[-1]):,}" if total_evals > 0 else "Unknown"
    if len(results) > 0:
        best_reward = f"{float(np.max(np.mean(results, axis=1))):.2f}"
    else:
        best_reward = "Unknown"
    if model_source == "best_model":
        source_label = "Best model from training"
    else:
        source_label = "Final training model"

    lines = [
        "",
        "## Training Monitoring",
        "",
        "This model was trained with comprehensive monitoring:",
        "",
        f"- **Total Evaluations**: {total_evals} (every 500,000 steps)",
        f"- **Final Training Step**: {final_step}",
        f"- **Best Evaluation Reward**: {best_reward}",
        f"- **Model Source**: {source_label}",
        "- **Callbacks Used**: EvalCallback, CheckpointCallback",
        "- **TensorBoard Logging**: Enabled",
        "",
        "",
    ]
    return "\n".join(lines)


if os.path.exists(EVALUATIONS_PATH):
    try:
        # The context manager closes the underlying archive once the arrays
        # have been read.
        with np.load(EVALUATIONS_PATH) as evaluations:
            timesteps = evaluations["timesteps"]
            results = evaluations["results"]
        training_info = build_training_info(timesteps, results, model_source)
        print(f"📈 发现训练日志: {len(timesteps)} 次评估记录")
    except Exception as e:
        # Best effort on purpose: an unreadable log must not block the upload,
        # so fall back to a generic section.
        print(f"⚠️ 读取训练日志失败: {e}")
        training_info = "\n## Training Monitoring\n\nModel trained with monitoring callbacks.\n"
else:
    training_info = "\n## Training Configuration\n\nStandard training without detailed monitoring.\n"

# ---------------------------------------------------------------------------
# Model card (README.md)
# ---------------------------------------------------------------------------
# The leading "---" section is YAML metadata, so its indentation is
# significant. The rest is ordinary markdown filled in from the evaluation
# results computed earlier in the script.
readme_content = f"""---
library_name: stable-baselines3
tags:
- PandaReachDense-v3
- deep-reinforcement-learning
- reinforcement-learning
- stable-baselines3
- robotics
- panda-gym
model-index:
- name: A2C
  results:
  - task:
      type: reinforcement-learning
      name: reinforcement-learning
    dataset:
      name: PandaReachDense-v3
      type: PandaReachDense-v3
    metrics:
    - type: mean_reward
      value: {mean_reward:.2f} +/- {std_reward:.2f}
      name: mean_reward
      verified: false
---

# {status_emoji} **A2C** Agent playing **PandaReachDense-v3**

This is a trained model of a **A2C** agent playing **PandaReachDense-v3**
using the [stable-baselines3 library](https://github.com/DLR-RM/stable-baselines3)
and the [Deep Reinforcement Learning Course](https://huggingface.co/deep-rl-course/unit6).

This environment is part of the [Panda-Gym](https://github.com/qgallouedec/panda-gym) environments and includes robotic manipulation tasks where the robot arm needs to reach a target position.

## 🏆 Evaluation Results

| Metric | Value |
|--------|-------|
| Mean Reward | {mean_reward:.2f} |
| Std Reward | {std_reward:.2f} |
| **Score (mean - std)** | **{score:.2f}** |
| Baseline Required | -3.5 |
| Evaluation Episodes | {N_EVAL_EPISODES} |
| Status | {status_emoji} {"**PASSED**" if score >= -3.5 else "**NOT PASSED**"} |
| Model Source | {model_source.replace('_', ' ').title()} |

{training_info}

## 🚀 Usage

```python
import gymnasium as gym
import panda_gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

# Load environment and normalization
env = make_vec_env("PandaReachDense-v3", n_envs=1)
env = VecNormalize.load("vec_normalize.pkl", env)

# ⚠️ CRITICAL: disable training mode and reward normalization at test time
env.training = False
env.norm_reward = False

# Load model
model = A2C.load("a2c-PandaReachDense-v3", env=env)

# Run inference
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
```

## 🔧 Training Configuration

- **Algorithm**: A2C (Advantage Actor-Critic)
- **Policy**: MultiInputPolicy (for Dict observation spaces)
- **Environment**: PandaReachDense-v3
- **Total Timesteps**: 2,000,000
- **Number of Parallel Envs**: 64
- **Normalization**: VecNormalize (observation + reward)
- **Observation Clipping**: 10.0
- **Evaluation Frequency**: Every 500,000 steps
- **Checkpoint Frequency**: Every 500,000 steps

## 🤖 Model Architecture

The agent uses a **MultiInputPolicy** because the observation space is a dictionary containing:
- `observation`: Robot joint positions, velocities, and gripper state
- `desired_goal`: Target position coordinates (x, y, z)
- `achieved_goal`: Current end-effector position coordinates (x, y, z)

The goal is to minimize the distance between `achieved_goal` and `desired_goal`.

## 📈 Performance Notes

- **Reward Range**: Typically from -50 (far from target) to 0 (at target)
- **Success Criteria**: Achieving mean reward > -3.5 consistently
- **Episode Length**: Usually 50 steps per episode
- **Convergence**: Expect improvement after 200k-500k steps

## 🎯 Tips for Reproduction

1. **Normalization is Critical**: Always use VecNormalize for robotic tasks
2. **MultiInputPolicy Required**: Dict observation spaces need special handling
3. **Sufficient Training**: 1M+ timesteps recommended for stable performance
4. **Evaluation**: Use deterministic=True for consistent evaluation results
"""

# ---------------------------------------------------------------------------
# Stage the files to upload
# ---------------------------------------------------------------------------
print("📦 准备上传文件...")
upload_folder = "/home/eason/Workspace/RL/Unit_6/upload_temp"
# Start from an empty staging directory: the whole folder is uploaded later,
# so leftovers from an interrupted earlier run must not be published.
shutil.rmtree(upload_folder, ignore_errors=True)
os.makedirs(upload_folder, exist_ok=True)

# Model card
readme_path = os.path.join(upload_folder, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)
print("✅ 创建 README.md")

# Model archive, renamed to the repository's model name
model_dest = os.path.join(upload_folder, f"{MODEL_NAME}.zip")
shutil.copy(f"{MODEL_PATH}.zip", model_dest)
print(f"✅ 复制模型文件: {MODEL_PATH}.zip -> {MODEL_NAME}.zip")

# Normalization statistics, always published under a fixed file name
vec_norm_dest = os.path.join(upload_folder, "vec_normalize.pkl")
shutil.copy(VEC_NORMALIZE_PATH, vec_norm_dest)
print(f"✅ 复制归一化文件: {VEC_NORMALIZE_PATH} -> vec_normalize.pkl")

# Demo video (optional). os.listdir returns entries in arbitrary order and the
# folder may still hold recordings from earlier runs, so pick the most recently
# modified .mp4 instead of whichever happens to be listed first.
video_files = [name for name in os.listdir(video_folder) if name.endswith(".mp4")]
if video_files:
    newest_video = max(
        video_files,
        key=lambda name: os.path.getmtime(os.path.join(video_folder, name)),
    )
    video_src = os.path.join(video_folder, newest_video)
    video_dest = os.path.join(upload_folder, "replay.mp4")
    shutil.copy(video_src, video_dest)
    print("✅ 复制视频文件")
else:
    print("⚠️ 未找到视频文件(可选)")

# Training evaluation log (optional)
evaluations_src = "/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"
if os.path.exists(evaluations_src):
    eval_dest = os.path.join(upload_folder, "training_evaluations.npz")
    shutil.copy(evaluations_src, eval_dest)
    print("✅ 复制训练评估日志")

# ---------------------------------------------------------------------------
# Upload to the Hugging Face Hub
# ---------------------------------------------------------------------------
print(f"\n🚀 上传到 {repo_id}...")

api = HfApi()

try:
    create_repo(repo_id, repo_type="model", exist_ok=True)
    print("✅ 仓库已创建/验证")
except Exception as e:
    # Deliberately non-fatal: the upload below is still attempted and reports
    # its own error if the repository really is unusable.
    print(f"⚠️ 仓库警告: {e}")

# Remember the outcome so the final summary does not claim an upload that
# never happened.
upload_succeeded = False
try:
    commit_message = (
        f"A2C PandaReach ({model_source}) - "
        f"Mean: {mean_reward:.2f}, Std: {std_reward:.2f}, Score: {score:.2f}"
    )
    api.upload_folder(
        folder_path=upload_folder,
        repo_id=repo_id,
        repo_type="model",
        commit_message=commit_message,
    )
    upload_succeeded = True

    print(f"\n{'=' * 60}")
    print("🎉 上传成功!")
    print(f"{'=' * 60}")
    print(f"🔗 模型页面: https://huggingface.co/{repo_id}")
    print("🏆 检查进度: https://huggingface.co/spaces/ThomasSimonini/Check-my-progress-Deep-RL-Course")
    print(f"📊 模型来源: {model_source.replace('_', ' ').title()}")
    print(f"🎯 评估分数: {score:.2f} ({'通过' if score >= -3.5 else '未通过'})")
    print(f"{'=' * 60}\n")
except Exception as e:
    print(f"\n❌ 上传失败: {e}")
    print(" 请检查:")
    print(" 1. 是否已运行 'huggingface-cli login'")
    print(" 2. 网络连接是否正常")
    print(" 3. 用户名是否正确\n")
finally:
    # ignore_errors: a cleanup problem must not hide the upload result.
    shutil.rmtree(upload_folder, ignore_errors=True)
    print("🧹 清理临时文件")

print("✨ 完成!")

# ---------------------------------------------------------------------------
# Final summary
# ---------------------------------------------------------------------------
print("\n" + "=" * 60)
print("📋 上传总结")
print("=" * 60)
if upload_succeeded:
    print("📁 上传的文件:")
else:
    print("📁 已准备的文件(上传失败,未发布):")
print(f" - {MODEL_NAME}.zip (模型)")
print(" - vec_normalize.pkl (归一化参数)")
print(" - README.md (文档)")
# `video_files` is built while staging the upload; list the video only if one
# was actually found and copied.
if video_files:
    print(" - replay.mp4 (演示视频)")
if os.path.exists("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"):
    print(" - training_evaluations.npz (训练日志)")

print("\n🎯 关键信息:")
print(f" - 使用了 {'最佳' if model_source == 'best_model' else '最终'} 模型")
print(f" - 评估分数: {score:.2f}")
print(f" - 状态: {'✅ 通过' if score >= -3.5 else '❌ 未通过'}")
print("=" * 60)