#!/bin/bash
#
# Quick-start script: prepare a HuggingFace dataset and optionally upload it.
#
# Steps:
#   1. Check that python3 and pip (for that python3) are available.
#   2. Install huggingface_hub, datasets and tqdm.
#   3. Check HF_TOKEN; when it is empty or unset the user may continue in
#      "prepare only" mode and the upload step is skipped.
#   4. Run prepare_hf_dataset.py, then compare the number of entries in the
#      videos/ and metadata/ directories under DATASET_PATH.
#   5. After two interactive confirmations, run upload_to_huggingface.py.
#
# Environment:
#   HF_TOKEN  HuggingFace token; only checked for being non-empty here.
#
# Both Python scripts are invoked by relative path, so run this script from
# the directory that contains them.

# Abort on unhandled errors, unset variables, and failing pipeline stages.
set -euo pipefail

# Directory whose contents are verified after preparation.
readonly DATASET_PATH="/playpen-ssd/dataset/droid_raw/hg_data"

#######################################
# Print a message to stderr and terminate the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  echo "$*" >&2
  exit 1
}

#######################################
# Count the non-hidden entries directly inside a directory.
# Uses a glob instead of parsing `ls`, so names containing newlines are
# counted once. The body is a subshell so `nullglob` does not leak out.
# Arguments: $1 - directory to inspect
# Outputs:   the entry count on stdout
# Returns:   1 (with an error on stderr) when the directory does not exist
#######################################
count_entries() (
  local dir=$1
  local -a entries

  if [[ ! -d "$dir" ]]; then
    echo "[ERROR] 数据集目录不存在: $dir" >&2
    exit 1
  fi

  shopt -s nullglob # an empty directory must yield 0, not the literal pattern
  entries=("$dir"/*)
  echo "${#entries[@]}"
)

#######################################
# Run the whole interactive workflow.
# Globals:   HF_TOKEN (read), DATASET_PATH (read), SAMPLE_COUNT (exported)
# Arguments: none
# Returns:   exits non-zero on the first failed step
#######################################
main() {
  # Default to empty so that `set -u` does not abort when the token is unset.
  local hf_token="${HF_TOKEN:-}"
  local continue_without_token sample_count
  local video_count metadata_count du_output dataset_size
  local upload_choice confirm_upload

  echo "========================================"
  echo "HuggingFace 数据集准备和上传"
  echo "========================================"

  # Step 1: required tools.
  echo ""
  echo "步骤 1: 检查环境..."

  if ! command -v python3 &> /dev/null; then
    die "[ERROR] Python3 未安装"
  fi
  echo " ✓ Python3"

  # Probe pip through python3 so the packages land in the interpreter that
  # later runs the Python scripts, not whichever `pip` is first on PATH.
  if ! python3 -m pip --version &> /dev/null; then
    die "[ERROR] pip 未安装"
  fi
  echo " ✓ pip"

  # Step 2: dependencies.
  echo ""
  echo "步骤 2: 安装依赖..."
  python3 -m pip install -q huggingface_hub datasets tqdm
  echo " ✓ 依赖已安装"

  # Step 3: token.
  echo ""
  echo "步骤 3: 检查 HuggingFace Token..."

  if [[ -z "$hf_token" ]]; then
    echo "[WARNING] HF_TOKEN 环境变量未设置"
    echo ""
    echo "请设置你的 HuggingFace Token:"
    echo " 1. 访问 https://huggingface.co/settings/tokens"
    echo " 2. 创建一个 Write 权限的 token"
    echo " 3. 运行: export HF_TOKEN='your_token_here'"
    echo ""
    read -r -p "是否继续(仅准备数据集,不上传)? (y/n): " continue_without_token
    if [[ "$continue_without_token" != "y" ]]; then
      exit 1
    fi
  else
    echo " ✓ HF_TOKEN 已设置"
  fi

  # Prepare the dataset.
  echo ""
  echo "========================================"
  echo "准备数据集"
  echo "========================================"

  echo ""
  read -r -p "采样数量 (默认: 2500): " sample_count
  sample_count=${sample_count:-2500}
  if [[ ! "$sample_count" =~ ^[1-9][0-9]*$ ]]; then
    die "[ERROR] 采样数量必须为正整数: $sample_count"
  fi

  # NOTE(review): the prompted value was previously never handed to
  # prepare_hf_dataset.py, so it had no effect. It is exported here as
  # SAMPLE_COUNT; confirm that the Python script reads it (or replace this
  # with the script's real command-line option).
  export SAMPLE_COUNT="$sample_count"

  echo ""
  echo "开始准备数据集 (采样 $sample_count 个视频)..."

  # Test the command directly: under `set -e` a separate `$?` check after
  # the command would never run on failure.
  if ! python3 prepare_hf_dataset.py; then
    die "[ERROR] 数据集准备失败"
  fi

  echo ""
  echo " ✓ 数据集准备完成"

  # Verify the dataset.
  echo ""
  echo "========================================"
  echo "验证数据集"
  echo "========================================"

  if [[ ! -d "$DATASET_PATH" ]]; then
    die "[ERROR] 数据集目录不存在: $DATASET_PATH"
  fi

  video_count=$(count_entries "$DATASET_PATH/videos")
  metadata_count=$(count_entries "$DATASET_PATH/metadata")

  echo " 视频数量: $video_count"
  echo " 元数据数量: $metadata_count"

  if ((video_count != metadata_count)); then
    echo "[WARNING] 视频和元数据数量不匹配"
  fi

  # The size is informational only, so a partial `du` failure (for example
  # an unreadable subdirectory) must not abort the run.
  du_output=$(du -sh "$DATASET_PATH") || true
  dataset_size=${du_output%%$'\t'*} # keep the first tab-separated field
  echo " 数据集大小: $dataset_size"

  # Upload to HuggingFace.
  if [[ -n "$hf_token" ]]; then
    echo ""
    echo "========================================"
    echo "上传到 HuggingFace"
    echo "========================================"

    echo ""
    read -r -p "是否上传到 HuggingFace? (y/n): " upload_choice

    if [[ "$upload_choice" == "y" ]]; then
      echo ""
      echo "请确认配置:"
      echo " - 编辑 upload_to_huggingface.py"
      echo " - 设置 HF_USERNAME (你的用户名)"
      echo " - 设置 DATASET_NAME (数据集名称)"
      echo ""
      read -r -p "配置已完成,继续上传? (y/n): " confirm_upload

      if [[ "$confirm_upload" == "y" ]]; then
        # Report the failure explicitly instead of exiting silently.
        if ! python3 upload_to_huggingface.py; then
          die "[ERROR] 上传失败"
        fi
        echo ""
        echo " ✓ 上传完成"
      fi
    fi
  else
    echo ""
    echo "[INFO] 跳过上传步骤(未设置 HF_TOKEN)"
  fi

  # Summary.
  echo ""
  echo "========================================"
  echo "完成!"
  echo "========================================"
  echo ""
  echo "数据集位置: $DATASET_PATH"
  echo ""
  echo "下一步:"
  echo " 1. 查看数据集: ls $DATASET_PATH"
  echo " 2. 查看 README: cat $DATASET_PATH/README.md"
  if [[ -n "$hf_token" ]]; then
    echo " 3. 访问 HuggingFace 查看数据集"
  else
    echo " 3. 设置 HF_TOKEN 并上传到 HuggingFace"
  fi
  echo ""
  echo "详细指南: cat UPLOAD_GUIDE.md"
}

main "$@"