a2c-PandaReachDense-v3 / Unit_6_upload.py

Upload Unit_6_upload.py with huggingface_hub

eef4ddc verified 10 days ago

13.9 kB

	# ============================================================
	# Unit 6_upload.py - 智能上传（优先使用最佳模型）
	# ============================================================

	import gymnasium as gym
	import panda_gym
	import numpy as np
	import os
	import shutil
	from stable_baselines3 import A2C
	from stable_baselines3.common.env_util import make_vec_env
	from stable_baselines3.common.vec_env import VecNormalize, VecVideoRecorder
	from stable_baselines3.common.evaluation import evaluate_policy
	from huggingface_hub import HfApi, create_repo

	# ============================================================
	# Configuration (⚠️ edit these values before running)
	# ============================================================
	USERNAME = "ImaghT"  # Hugging Face user name that owns the target repository
	MODEL_NAME = "a2c-PandaReachDense-v3"  # repository name and uploaded model file stem
	ENV_ID = "PandaReachDense-v3"  # environment id handed to make_vec_env
	N_EVAL_EPISODES = 20  # more evaluation episodes give a more accurate estimate

	# Hub repository identifier in "<user>/<model>" form.
	repo_id = "/".join((USERNAME, MODEL_NAME))

	# ============================================================
	# 1. Model file detection (prefer the best model over the final one)
	# ============================================================
	print("="*60)
	print("🔍 检测可用模型文件...")
	print("="*60)

	# Candidate locations. Model paths are written without the ".zip" suffix;
	# the suffix is appended wherever the file itself is checked or copied.
	BEST_MODEL_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model"
	BEST_VEC_NORMALIZE_PATH = "/home/eason/Workspace/RL/Unit_6/logs/best_model_vecnormalize.pkl"
	FINAL_MODEL_PATH = "a2c-PandaReachDense-v3"
	FINAL_VEC_NORMALIZE_PATH = "vec_normalize.pkl"

	# Priority: best model saved during training > final model.
	# A candidate is usable only when BOTH the model archive and its
	# normalization statistics are regular files (a directory does not count).
	if os.path.isfile(f"{BEST_MODEL_PATH}.zip") and os.path.isfile(BEST_VEC_NORMALIZE_PATH):
	    print("✅ 发现训练期间保存的最佳模型（推荐使用）")
	    MODEL_PATH = BEST_MODEL_PATH
	    VEC_NORMALIZE_PATH = BEST_VEC_NORMALIZE_PATH
	    model_source = "best_model"
	elif os.path.isfile(f"{FINAL_MODEL_PATH}.zip") and os.path.isfile(FINAL_VEC_NORMALIZE_PATH):
	    print("✅ 发现最终训练模型")
	    MODEL_PATH = FINAL_MODEL_PATH
	    VEC_NORMALIZE_PATH = FINAL_VEC_NORMALIZE_PATH
	    model_source = "final_model"
	else:
	    print("❌ 错误: 未找到可用的模型文件！")
	    print("\n请确保以下文件之一存在:")
	    print(f" 方案1: {BEST_MODEL_PATH}.zip + {BEST_VEC_NORMALIZE_PATH}")
	    print(f" 方案2: {FINAL_MODEL_PATH}.zip + {FINAL_VEC_NORMALIZE_PATH}")
	    print("\n请先运行 Unit 6.py 训练代码。")
	    # The bare `exit` helper is injected by the `site` module and is not
	    # guaranteed to exist; raising SystemExit works in every interpreter.
	    raise SystemExit(1)

	print(f"📁 使用模型: {MODEL_PATH}")
	print(f"📁 使用归一化: {VEC_NORMALIZE_PATH}")
	print(f"📊 模型来源: {model_source}\n")

	# ============================================================
	# 2. Load the model
	# ============================================================
	print("加载模型...")
	# Wrap a fresh single-environment vec env with the saved normalization state.
	eval_env = VecNormalize.load(VEC_NORMALIZE_PATH, make_vec_env(ENV_ID, n_envs=1))
	# Test-time settings: training mode off, reward normalization off.
	eval_env.training, eval_env.norm_reward = False, False

	model = A2C.load(MODEL_PATH, env=eval_env)
	print("✅ 模型加载成功\n")

	# ============================================================
	# 3. Evaluate the model
	# ============================================================
	print("="*60)
	print(f"🧪 开始评估 ({N_EVAL_EPISODES} episodes)...")
	print("="*60)

	mean_reward, std_reward = evaluate_policy(
	    model, eval_env, n_eval_episodes=N_EVAL_EPISODES, deterministic=True
	)

	# The pass check uses mean minus one standard deviation against -3.5.
	score = mean_reward - std_reward
	passed = score >= -3.5
	# status_emoji is reused later when the README is rendered.
	status_emoji = "✅" if passed else "❌"

	print("\n" + "="*60)
	print("📊 评估结果:")
	print(f" Mean Reward: {mean_reward:.2f}")
	print(f" Std Reward: {std_reward:.2f}")
	print(f" Score (mean-std): {score:.2f}")
	print(" 通过基准线: -3.5")
	if passed:
	    print(" ✅ 状态: PASSED")
	else:
	    # Also report how far the score is below the baseline.
	    print(f" ❌ 状态: NOT PASSED (还差 {-3.5 - score:.2f} 分)")
	print("="*60 + "\n")

	# ============================================================
	# 4. Record a demo video
	# ============================================================
	print("🎬 生成演示视频...")
	video_folder = "/home/eason/Workspace/RL/Unit_6/video_upload"
	os.makedirs(video_folder, exist_ok=True)

	# Single source of truth for both the recorder length and the rollout length,
	# so the two cannot drift apart.
	video_length = 500

	# A separate environment is built for recording, using the same saved
	# normalization state and test-time settings as the evaluation environment.
	video_env = make_vec_env(ENV_ID, n_envs=1)
	video_env = VecNormalize.load(VEC_NORMALIZE_PATH, video_env)
	video_env.training = False
	video_env.norm_reward = False

	video_env = VecVideoRecorder(
	    video_env,
	    video_folder,
	    record_video_trigger=lambda step: step == 0,  # start recording at step 0 only
	    video_length=video_length,
	    name_prefix="panda-reach-agent",
	)

	try:
	    obs = video_env.reset()
	    for _ in range(video_length):
	        action, _ = model.predict(obs, deterministic=True)
	        obs, _, _, _ = video_env.step(action)
	finally:
	    # Always close the recorder, even if the rollout raises, so the
	    # environment and any open video writer are released.
	    video_env.close()
	print("✅ 视频已生成\n")

	# ============================================================
	# 5. Inspect the training log (optional information)
	# ============================================================
	# Builds `training_info`, a Markdown fragment embedded in the README.
	training_info = ""
	evaluations_path = "/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"
	if os.path.exists(evaluations_path):
	    try:
	        # NpzFile keeps the archive open; the context manager closes it
	        # once both arrays have been read into memory.
	        with np.load(evaluations_path) as evaluations:
	            timesteps = evaluations['timesteps']
	            results = evaluations['results']

	        # Summarize the training run. Values are formatted only when data
	        # exists, so the "Unknown" fallback never meets a numeric format spec.
	        total_evals = len(timesteps)
	        final_timestep = f"{int(timesteps[-1]):,}" if total_evals > 0 else "Unknown"
	        best_eval_reward = (
	            f"{float(np.max(results.mean(axis=1))):.2f}" if len(results) > 0 else "Unknown"
	        )
	        source_label = (
	            "Best model from training" if model_source == "best_model" else "Final training model"
	        )

	        training_info = "\n".join([
	            "",
	            "## Training Monitoring",
	            "",
	            "This model was trained with comprehensive monitoring:",
	            "",
	            f"- Total Evaluations: {total_evals} (every 500,000 steps)",
	            f"- Final Training Step: {final_timestep}",
	            f"- Best Evaluation Reward: {best_eval_reward}",
	            f"- Model Source: {source_label}",
	            "- Callbacks Used: EvalCallback, CheckpointCallback",
	            "- TensorBoard Logging: Enabled",
	            "",
	            "",
	        ])
	        print(f"📈 发现训练日志: {total_evals} 次评估记录")
	    except Exception as e:
	        # Deliberately broad: this section is optional, so an unreadable or
	        # malformed log must not abort the upload. Fall back to a generic note.
	        print(f"⚠️ 读取训练日志失败: {e}")
	        training_info = "\n## Training Monitoring\n\nModel trained with monitoring callbacks.\n"
	else:
	    training_info = "\n## Training Configuration\n\nStandard training without detailed monitoring.\n"

	# ============================================================
	# 6. Build the README.md (model card)
	# ============================================================
	# The card is assembled from explicit lines so that the YAML front matter
	# nesting, the Markdown table pipes and the usage snippet's indentation are
	# exactly what is written here, independent of this file's own indentation.
	passed_label = "PASSED" if score >= -3.5 else "NOT PASSED"
	readme_lines = [
	    # --- YAML front matter (Hub model-card metadata) ---
	    "---",
	    "library_name: stable-baselines3",
	    "tags:",
	    f"- {ENV_ID}",
	    "- deep-reinforcement-learning",
	    "- reinforcement-learning",
	    "- stable-baselines3",
	    "- robotics",
	    "- panda-gym",
	    "model-index:",
	    "- name: A2C",
	    "  results:",
	    "  - task:",
	    "      type: reinforcement-learning",
	    "      name: reinforcement-learning",
	    "    dataset:",
	    f"      name: {ENV_ID}",
	    f"      type: {ENV_ID}",
	    "    metrics:",
	    "    - type: mean_reward",
	    f"      value: {mean_reward:.2f} +/- {std_reward:.2f}",
	    "      name: mean_reward",
	    "      verified: false",
	    "---",
	    "",
	    f"# {status_emoji} A2C Agent playing {ENV_ID}",
	    "",
	    f"This is a trained model of a A2C agent playing {ENV_ID}",
	    "using the [stable-baselines3 library](https://github.com/DLR-RM/stable-baselines3)",
	    "and the [Deep Reinforcement Learning Course](https://huggingface.co/deep-rl-course/unit6).",
	    "",
	    "This environment is part of the [Panda-Gym](https://github.com/qgallouedec/panda-gym) environments and includes robotic manipulation tasks where the robot arm needs to reach a target position.",
	    "",
	    "## 🏆 Evaluation Results",
	    "",
	    # Plain pipes: a backslash-escaped pipe would be written literally and
	    # the table would not render.
	    "| Metric | Value |",
	    "|--------|-------|",
	    f"| Mean Reward | {mean_reward:.2f} |",
	    f"| Std Reward | {std_reward:.2f} |",
	    f"| Score (mean - std) | {score:.2f} |",
	    "| Baseline Required | -3.5 |",
	    f"| Evaluation Episodes | {N_EVAL_EPISODES} |",
	    f"| Status | {status_emoji} {passed_label} |",
	    f"| Model Source | {model_source.replace('_', ' ').title()} |",
	    "",
	    training_info,
	    "",
	    "## 🚀 Usage",
	    "",
	    "```python",
	    "import gymnasium as gym",
	    "import panda_gym",
	    "from stable_baselines3 import A2C",
	    "from stable_baselines3.common.env_util import make_vec_env",
	    "from stable_baselines3.common.vec_env import VecNormalize",
	    "",
	    "# Load environment and normalization",
	    f'env = make_vec_env("{ENV_ID}", n_envs=1)',
	    'env = VecNormalize.load("vec_normalize.pkl", env)',
	    "",
	    "# ⚠️ CRITICAL: disable training mode and reward normalization at test time",
	    "env.training = False",
	    "env.norm_reward = False",
	    "",
	    "# Load model",
	    f'model = A2C.load("{MODEL_NAME}", env=env)',
	    "",
	    "# Run inference",
	    "obs = env.reset()",
	    "for _ in range(1000):",
	    "    action, _states = model.predict(obs, deterministic=True)",
	    "    obs, reward, done, info = env.step(action)",
	    "    if done:",
	    "        obs = env.reset()",
	    "```",
	    "",
	    "## 🔧 Training Configuration",
	    "",
	    "- Algorithm: A2C (Advantage Actor-Critic)",
	    "- Policy: MultiInputPolicy (for Dict observation spaces)",
	    f"- Environment: {ENV_ID}",
	    # Was "200,0000": digits grouped by ten-thousands; the value is two million.
	    "- Total Timesteps: 2,000,000",
	    "- Number of Parallel Envs: 64",
	    "- Normalization: VecNormalize (observation + reward)",
	    "- Observation Clipping: 10.0",
	    "- Evaluation Frequency: Every 500,000 steps",
	    "- Checkpoint Frequency: Every 500,000 steps",
	    "",
	    "## 🤖 Model Architecture",
	    "",
	    "The agent uses a MultiInputPolicy because the observation space is a dictionary containing:",
	    "- `observation`: Robot joint positions, velocities, and gripper state",
	    "- `desired_goal`: Target position coordinates (x, y, z)",
	    "- `achieved_goal`: Current end-effector position coordinates (x, y, z)",
	    "",
	    "The goal is to minimize the distance between `achieved_goal` and `desired_goal`.",
	    "",
	    "## 📈 Performance Notes",
	    "",
	    "- Reward Range: Typically from -50 (far from target) to 0 (at target)",
	    "- Success Criteria: Achieving mean reward > -3.5 consistently",
	    "- Episode Length: Usually 50 steps per episode",
	    "- Convergence: Expect improvement after 200k-500k steps",
	    "",
	    "## 🎯 Tips for Reproduction",
	    "",
	    "1. Normalization is Critical: Always use VecNormalize for robotic tasks",
	    "2. MultiInputPolicy Required: Dict observation spaces need special handling",
	    "3. Sufficient Training: 1M+ timesteps recommended for stable performance",
	    "4. Evaluation: Use deterministic=True for consistent evaluation results",
	]
	# Trailing newline so the written file ends with a line terminator.
	readme_content = "\n".join(readme_lines) + "\n"

	# ============================================================
	# 7. Stage the files to upload
	# ============================================================
	print("📦 准备上传文件...")
	upload_folder = "/home/eason/Workspace/RL/Unit_6/upload_temp"
	# Start from an empty staging directory: the whole folder is uploaded later,
	# so leftovers from an interrupted earlier run must not end up in the commit.
	shutil.rmtree(upload_folder, ignore_errors=True)
	os.makedirs(upload_folder, exist_ok=True)

	# Write the README
	readme_path = os.path.join(upload_folder, "README.md")
	with open(readme_path, "w", encoding="utf-8") as f:
	    f.write(readme_content)
	print("✅ 创建 README.md")

	# Copy the model archive (renamed to the standard name)
	model_dest = os.path.join(upload_folder, f"{MODEL_NAME}.zip")
	shutil.copy(f"{MODEL_PATH}.zip", model_dest)
	print(f"✅ 复制模型文件: {MODEL_PATH}.zip -> {MODEL_NAME}.zip")

	# Copy the normalization statistics (renamed to the standard name)
	vec_norm_dest = os.path.join(upload_folder, "vec_normalize.pkl")
	shutil.copy(VEC_NORMALIZE_PATH, vec_norm_dest)
	print(f"✅ 复制归一化文件: {VEC_NORMALIZE_PATH} -> vec_normalize.pkl")

	# Copy the demo video. os.listdir order is arbitrary and the folder may hold
	# recordings from earlier runs, so pick the most recently modified mp4.
	video_files = [f for f in os.listdir(video_folder) if f.endswith(".mp4")]
	if video_files:
	    newest_video = max(
	        video_files,
	        key=lambda name: os.path.getmtime(os.path.join(video_folder, name)),
	    )
	    video_src = os.path.join(video_folder, newest_video)
	    video_dest = os.path.join(upload_folder, "replay.mp4")
	    shutil.copy(video_src, video_dest)
	    print("✅ 复制视频文件")
	else:
	    print("⚠️ 未找到视频文件（可选）")

	# Optional: include the training evaluation log
	evaluations_src = "/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"
	if os.path.exists(evaluations_src):
	    eval_dest = os.path.join(upload_folder, "training_evaluations.npz")
	    shutil.copy(evaluations_src, eval_dest)
	    print("✅ 复制训练评估日志")

	# ============================================================
	# 8. Upload to the Hugging Face Hub
	# ============================================================
	print(f"\n🚀 上传到 {repo_id}...")

	api = HfApi()

	try:
	    # Create the repository (no error if it already exists)
	    create_repo(repo_id, repo_type="model", exist_ok=True)
	    print("✅ 仓库已创建/验证")
	except Exception as e:
	    # Best effort: only warn here; a real problem surfaces in the upload below.
	    print(f"⚠️ 仓库警告: {e}")

	try:
	    # Upload the whole staging folder as a single commit
	    commit_message = f"A2C PandaReach ({model_source}) - Mean: {mean_reward:.2f}, Std: {std_reward:.2f}, Score: {score:.2f}"

	    api.upload_folder(
	        folder_path=upload_folder,
	        repo_id=repo_id,
	        repo_type="model",
	        commit_message=commit_message,
	    )
	except Exception as e:
	    print(f"\n❌ 上传失败: {e}")
	    print(" 请检查:")
	    print(" 1. 是否已运行 'huggingface-cli login'")
	    print(" 2. 网络连接是否正常")
	    print(" 3. 用户名是否正确\n")
	    # Stop with a non-zero status so a failed upload is not followed by the
	    # "done" message and upload summary. The finally clause still cleans up.
	    raise SystemExit(1) from e
	else:
	    print(f"\n{'='*60}")
	    print("🎉 上传成功！")
	    print(f"{'='*60}")
	    print(f"🔗 模型页面: https://huggingface.co/{repo_id}")
	    print("🏆 检查进度: https://huggingface.co/spaces/ThomasSimonini/Check-my-progress-Deep-RL-Course")
	    print(f"📊 模型来源: {model_source.replace('_', ' ').title()}")
	    print(f"🎯 评估分数: {score:.2f} ({'通过' if score >= -3.5 else '未通过'})")
	    print(f"{'='*60}\n")
	finally:
	    # Remove the staging directory; a cleanup problem must not mask the
	    # upload outcome, hence ignore_errors.
	    shutil.rmtree(upload_folder, ignore_errors=True)
	    print("🧹 清理临时文件")

	print("✨ 完成！")

	# ============================================================
	# 9. Final summary output
	# ============================================================
	print("\n" + "="*60)
	print("📋 上传总结")
	print("="*60)
	print("📁 上传的文件:")
	print(f" - {MODEL_NAME}.zip (模型)")
	print(" - vec_normalize.pkl (归一化参数)")
	print(" - README.md (文档)")
	# replay.mp4 is staged only when a recording was found, so list it only then.
	if video_files:
	    print(" - replay.mp4 (演示视频)")
	# The evaluation log is likewise staged only when the file exists.
	if os.path.exists("/home/eason/Workspace/RL/Unit_6/logs/evaluations.npz"):
	    print(" - training_evaluations.npz (训练日志)")

	print("\n🎯 关键信息:")
	print(f" - 使用了 {'最佳' if model_source == 'best_model' else '最终'} 模型")
	print(f" - 评估分数: {score:.2f}")
	print(f" - 状态: {'✅ 通过' if score >= -3.5 else '❌ 未通过'}")
	print("="*60)