File size: 13,896 Bytes

eef4ddc

# ============================================================
# Unit 6_upload.py - 智能上传（优先使用最佳模型）
# ============================================================

import gymnasium as gym
import panda_gym
import numpy as np
import os
import shutil
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize, VecVideoRecorder
from stable_baselines3.common.evaluation import evaluate_policy
from huggingface_hub import HfApi, create_repo

# ============================================================
# 配置参数（⚠️ 修改这里）
# ============================================================
USERNAME = "ImaghT"  # 你的 HF 用户名
MODEL_NAME = "a2c-PandaReachDense-v3"
ENV_ID = "PandaReachDense-v3"
N_EVAL_EPISODES = 20  # 增加评估episodes获得更准确结果

repo_id = f"{USERNAME}/{MODEL_NAME}"

# ============================================================
# 1. 智能文件检测（优先使用最佳模型）
# ============================================================
print("="*60)
print("🔍 检测可用模型文件...")
print("="*60)

# 文件路径定义
BEST_MODEL_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model"
BEST_VEC_NORMALIZE_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model_vecnormalize.pkl"
FINAL_MODEL_PATH = "a2c-PandaReachDense-v3"
FINAL_VEC_NORMALIZE_PATH = "vec_normalize.pkl"

# 🎯 优先级检查：最佳模型 > 最终模型
if os.path.exists(f"{BEST_MODEL_PATH}.zip") and os.path.exists(BEST_VEC_NORMALIZE_PATH):
    print("✅ 发现训练期间保存的最佳模型（推荐使用）")
    MODEL_PATH = BEST_MODEL_PATH
    VEC_NORMALIZE_PATH = BEST_VEC_NORMALIZE_PATH
    model_source = "best_model"
elif os.path.exists(f"{FINAL_MODEL_PATH}.zip") and os.path.exists(FINAL_VEC_NORMALIZE_PATH):
    print("✅ 发现最终训练模型")
    MODEL_PATH = FINAL_MODEL_PATH
    VEC_NORMALIZE_PATH = FINAL_VEC_NORMALIZE_PATH
    model_source = "final_model"
else:
    print("❌ 错误: 未找到可用的模型文件！")
    print("\n请确保以下文件之一存在:")
    print(f"   方案1: {BEST_MODEL_PATH}.zip + {BEST_VEC_NORMALIZE_PATH}")
    print(f"   方案2: {FINAL_MODEL_PATH}.zip + {FINAL_VEC_NORMALIZE_PATH}")
    print("\n请先运行 Unit 6.py 训练代码。")
    exit(1)

print(f"📁 使用模型: {MODEL_PATH}")
print(f"📁 使用归一化: {VEC_NORMALIZE_PATH}")
print(f"📊 模型来源: {model_source}\n")

# ============================================================
# 2. 加载模型
# ============================================================
print("加载模型...")
eval_env = make_vec_env(ENV_ID, n_envs=1)
eval_env = VecNormalize.load(VEC_NORMALIZE_PATH, eval_env)
eval_env.training = False
eval_env.norm_reward = False

model = A2C.load(MODEL_PATH, env=eval_env)
print("✅ 模型加载成功\n")

# ============================================================
# 3. 评估模型
# ============================================================
print("="*60)
print(f"🧪 开始评估 ({N_EVAL_EPISODES} episodes)...")
print("="*60)

mean_reward, std_reward = evaluate_policy(
    model, 
    eval_env, 
    n_eval_episodes=N_EVAL_EPISODES, 
    deterministic=True
)

score = mean_reward - std_reward

print("\n" + "="*60)
print("📊 评估结果:")
print(f"  Mean Reward:       {mean_reward:.2f}")
print(f"  Std Reward:        {std_reward:.2f}")
print(f"  Score (mean-std):  {score:.2f}")
print(f"  通过基准线:         -3.5")
if score >= -3.5:
    print(f"  ✅ 状态: PASSED")
    status_emoji = "✅"
else:
    print(f"  ❌ 状态: NOT PASSED (还差 {-3.5 - score:.2f} 分)")
    status_emoji = "❌"
print("="*60 + "\n")

# ============================================================
# 4. 生成演示视频
# ============================================================
print("🎬 生成演示视频...")
video_folder = "/home/eason/Workspace/RL/Unit_6/video_upload"
os.makedirs(video_folder, exist_ok=True)

video_env = make_vec_env(ENV_ID, n_envs=1)
video_env = VecNormalize.load(VEC_NORMALIZE_PATH, video_env)
video_env.training = False
video_env.norm_reward = False

video_env = VecVideoRecorder(
    video_env,
    video_folder,
    record_video_trigger=lambda x: x == 0,
    video_length=500,
    name_prefix="panda-reach-agent"
)

obs = video_env.reset()
for _ in range(500):
    action, _ = model.predict(obs, deterministic=True)
    obs, _, _, _ = video_env.step(action)

video_env.close()
print(f"✅ 视频已生成\n")

# ============================================================
# 5. 检查训练日志（可选信息）
# ============================================================
training_info = ""
if os.path.exists("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"):
    try:
        evaluations = np.load("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz")
        timesteps = evaluations['timesteps']
        results = evaluations['results']
        
        # 获取训练过程信息
        total_evals = len(timesteps)
        final_timestep = timesteps[-1] if len(timesteps) > 0 else "Unknown"
        best_eval_reward = np.max(results.mean(axis=1)) if len(results) > 0 else "Unknown"
        
        training_info = f"""
## Training Monitoring

This model was trained with comprehensive monitoring:

- **Total Evaluations**: {total_evals} (every 500,000 steps)
- **Final Training Step**: {final_timestep:,}
- **Best Evaluation Reward**: {best_eval_reward:.2f}
- **Model Source**: {"Best model from training" if model_source == "best_model" else "Final training model"}
- **Callbacks Used**: EvalCallback, CheckpointCallback
- **TensorBoard Logging**: Enabled

"""
        print(f"📈 发现训练日志: {total_evals} 次评估记录")
    except Exception as e:
        print(f"⚠️ 读取训练日志失败: {e}")
        training_info = "\n## Training Monitoring\n\nModel trained with monitoring callbacks.\n"
else:
    training_info = "\n## Training Configuration\n\nStandard training without detailed monitoring.\n"

# ============================================================
# 6. 创建增强版 README.md
# ============================================================
readme_content = f"""---
library_name: stable-baselines3
tags:
- PandaReachDense-v3
- deep-reinforcement-learning
- reinforcement-learning
- stable-baselines3
- robotics
- panda-gym
model-index:
- name: A2C
  results:
  - task:
      type: reinforcement-learning
      name: reinforcement-learning
    dataset:
      name: PandaReachDense-v3
      type: PandaReachDense-v3
    metrics:
    - type: mean_reward
      value: {mean_reward:.2f} +/- {std_reward:.2f}
      name: mean_reward
      verified: false
---

# {status_emoji} **A2C** Agent playing **PandaReachDense-v3**

This is a trained model of a **A2C** agent playing **PandaReachDense-v3**
using the [stable-baselines3 library](https://github.com/DLR-RM/stable-baselines3)
and the [Deep Reinforcement Learning Course](https://huggingface.co/deep-rl-course/unit6).

This environment is part of the [Panda-Gym](https://github.com/qgallouedec/panda-gym) environments and includes robotic manipulation tasks where the robot arm needs to reach a target position.

## 🏆 Evaluation Results

| Metric | Value |
|--------|-------|
| Mean Reward | {mean_reward:.2f} |
| Std Reward | {std_reward:.2f} |
| **Score (mean - std)** | **{score:.2f}** |
| Baseline Required | -3.5 |
| Evaluation Episodes | {N_EVAL_EPISODES} |
| Status | {status_emoji} {"**PASSED**" if score >= -3.5 else "**NOT PASSED**"} |
| Model Source | {model_source.replace('_', ' ').title()} |

{training_info}

## 🚀 Usage

```python
import gymnasium as gym
import panda_gym
from stable_baselines3 import A2C
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecNormalize

# Load environment and normalization 
env = make_vec_env("PandaReachDense-v3", n_envs=1)
env = VecNormalize.load("vec_normalize.pkl", env)

# ⚠️ CRITICAL: disable training mode and reward normalization at test time 
env.training = False
env.norm_reward = False

# Load model 
model = A2C.load("a2c-PandaReachDense-v3", env=env)

# Run inference
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
``` 

## 🔧 Training Configuration

- **Algorithm**: A2C (Advantage Actor-Critic)
- **Policy**: MultiInputPolicy (for Dict observation spaces)
- **Environment**: PandaReachDense-v3
- **Total Timesteps**: 200,0000
- **Number of Parallel Envs**: 64
- **Normalization**: VecNormalize (observation + reward)
- **Observation Clipping**: 10.0
- **Evaluation Frequency**: Every 500,000 steps
- **Checkpoint Frequency**: Every 500,000 steps

## 🤖 Model Architecture

The agent uses a **MultiInputPolicy** because the observation space is a dictionary containing:
- `observation`: Robot joint positions, velocities, and gripper state
- `desired_goal`: Target position coordinates (x, y, z)
- `achieved_goal`: Current end-effector position coordinates (x, y, z)

The goal is to minimize the distance between `achieved_goal` and `desired_goal`.

## 📈 Performance Notes

- **Reward Range**: Typically from -50 (far from target) to 0 (at target)
- **Success Criteria**: Achieving mean reward > -3.5 consistently
- **Episode Length**: Usually 50 steps per episode
- **Convergence**: Expect improvement after 200k-500k steps

## 🎯 Tips for Reproduction

1. **Normalization is Critical**: Always use VecNormalize for robotic tasks
2. **MultiInputPolicy Required**: Dict observation spaces need special handling  
3. **Sufficient Training**: 1M+ timesteps recommended for stable performance
4. **Evaluation**: Use deterministic=True for consistent evaluation results
"""

# ============================================================
# 7. 准备上传文件
# ============================================================
print("📦 准备上传文件...")
upload_folder = "/home/eason/Workspace/RL/Unit_6/upload_temp"
os.makedirs(upload_folder, exist_ok=True)

# 保存 README
readme_path = os.path.join(upload_folder, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_content)
print(f"✅ 创建 README.md")

# 复制模型文件（重命名为标准名称）
model_dest = os.path.join(upload_folder, f"{MODEL_NAME}.zip")
shutil.copy(f"{MODEL_PATH}.zip", model_dest)
print(f"✅ 复制模型文件: {MODEL_PATH}.zip -> {MODEL_NAME}.zip")

# 复制归一化文件（重命名为标准名称）
vec_norm_dest = os.path.join(upload_folder, "vec_normalize.pkl")
shutil.copy(VEC_NORMALIZE_PATH, vec_norm_dest)
print(f"✅ 复制归一化文件: {VEC_NORMALIZE_PATH} -> vec_normalize.pkl")

# 复制视频文件
video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]
if video_files:
    video_src = os.path.join(video_folder, video_files[0])
    video_dest = os.path.join(upload_folder, "replay.mp4")
    shutil.copy(video_src, video_dest)
    print(f"✅ 复制视频文件")
else:
    print(f"⚠️ 未找到视频文件（可选）")

# 可选：复制训练日志
if os.path.exists("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"):
    eval_dest = os.path.join(upload_folder, "training_evaluations.npz")
    shutil.copy("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz", eval_dest)
    print(f"✅ 复制训练评估日志")

# ============================================================
# 8. 上传到 Hugging Face Hub
# ============================================================
print(f"\n🚀 上传到 {repo_id}...")

api = HfApi()

try:
    # 创建仓库（如果已存在则跳过）
    create_repo(repo_id, repo_type="model", exist_ok=True)
    print(f"✅ 仓库已创建/验证")
except Exception as e:
    print(f"⚠️ 仓库警告: {e}")

try:
    # 上传整个文件夹
    commit_message = f"A2C PandaReach ({model_source}) - Mean: {mean_reward:.2f}, Std: {std_reward:.2f}, Score: {score:.2f}"
    
    api.upload_folder(
        folder_path=upload_folder,
        repo_id=repo_id,
        repo_type="model",
        commit_message=commit_message
    )
    print(f"\n{'='*60}")
    print("🎉 上传成功！")
    print(f"{'='*60}")
    print(f"🔗 模型页面: https://huggingface.co/{repo_id}")
    print(f"🏆 检查进度: https://huggingface.co/spaces/ThomasSimonini/Check-my-progress-Deep-RL-Course")
    print(f"📊 模型来源: {model_source.replace('_', ' ').title()}")
    print(f"🎯 评估分数: {score:.2f} ({'通过' if score >= -3.5 else '未通过'})")
    print(f"{'='*60}\n")
except Exception as e:
    print(f"\n❌ 上传失败: {e}")
    print("   请检查:")
    print("   1. 是否已运行 'huggingface-cli login'")
    print("   2. 网络连接是否正常")
    print("   3. 用户名是否正确\n")
finally:
    # 清理临时文件
    shutil.rmtree(upload_folder)
    print("🧹 清理临时文件")

print("✨ 完成！")

# ============================================================
# 9. 额外信息输出
# ============================================================
print("\n" + "="*60)
print("📋 上传总结")
print("="*60)
print(f"📁 上传的文件:")
print(f"   - {MODEL_NAME}.zip (模型)")
print(f"   - vec_normalize.pkl (归一化参数)")
print(f"   - README.md (文档)")
print(f"   - replay.mp4 (演示视频)")
if os.path.exists("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"):
    print(f"   - training_evaluations.npz (训练日志)")

print(f"\n🎯 关键信息:")
print(f"   - 使用了 {'最佳' if model_source == 'best_model' else '最终'} 模型")
print(f"   - 评估分数: {score:.2f}")
print(f"   - 状态: {'✅ 通过' if score >= -3.5 else '❌ 未通过'}")
print("="*60)