File size: 3,816 Bytes
a741a7c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | #!/bin/bash
# HuggingFace dataset preparation and upload - quick-start script.
# Stop at the first command that exits with a non-zero status.
set -e
# Opening banner.
printf '%s\n' \
  "========================================" \
  "HuggingFace 数据集准备和上传" \
  "========================================"
# Step 1: check that the required tools are available.
echo ""
echo "步骤 1: 检查环境..."
if ! command -v python3 > /dev/null 2>&1; then
  echo "[ERROR] Python3 未安装"
  exit 1
fi
echo " ✓ Python3"
# Probe pip through the same interpreter that later runs the Python scripts.
# A bare `pip` found on PATH may belong to a different Python installation,
# in which case the packages would be installed where python3 cannot import
# them.
if ! python3 -m pip --version > /dev/null 2>&1; then
  echo "[ERROR] pip 未安装"
  exit 1
fi
echo " ✓ pip"

# Step 2: install the Python dependencies for that same interpreter.
echo ""
echo "步骤 2: 安装依赖..."
# Checked explicitly so a failed install is reported instead of relying on
# the shell's exit-on-error behaviour alone.
if ! python3 -m pip install -q huggingface_hub datasets tqdm; then
  echo "[ERROR] 依赖安装失败"
  exit 1
fi
echo " ✓ 依赖已安装"
# Step 3: check whether a HuggingFace token is present in the environment.
echo ""
echo "步骤 3: 检查 HuggingFace Token..."
if [ -n "$HF_TOKEN" ]; then
  echo " ✓ HF_TOKEN 已设置"
else
  # No token: print setup instructions, then ask whether to carry on with
  # dataset preparation only.
  printf '%s\n' \
    "[WARNING] HF_TOKEN 环境变量未设置" \
    "" \
    "请设置你的 HuggingFace Token:" \
    " 1. 访问 https://huggingface.co/settings/tokens" \
    " 2. 创建一个 Write 权限的 token" \
    " 3. 运行: export HF_TOKEN='your_token_here'" \
    ""
  read -p "是否继续(仅准备数据集,不上传)? (y/n): " continue_without_token
  # Any answer other than a literal lowercase "y" aborts the script.
  [ "$continue_without_token" = "y" ] || exit 1
fi
# Prepare the dataset.
echo ""
echo "========================================"
echo "准备数据集"
echo "========================================"
echo ""
# -r keeps backslashes in the reply literal. `|| true` lets a closed stdin
# (EOF) fall through to the default below instead of aborting the script.
read -r -p "采样数量 (默认: 2500): " sample_count || true
sample_count=${sample_count:-2500}
echo ""
# NOTE(review): sample_count is only echoed here; it is not passed to
# prepare_hf_dataset.py as an argument or exported. Confirm how that script
# obtains its sample size.
echo "开始准备数据集 (采样 $sample_count 个视频)..."
# Test the command directly: when exit-on-error is active, a separate `$?`
# check after the command is never reached on failure, so the error message
# would never be printed.
if ! python3 prepare_hf_dataset.py; then
  echo "[ERROR] 数据集准备失败"
  exit 1
fi
echo ""
echo " ✓ 数据集准备完成"
# Verify the prepared dataset.
echo ""
echo "========================================"
echo "验证数据集"
echo "========================================"
DATASET_PATH="/playpen-ssd/dataset/droid_raw/hg_data"
if [ ! -d "$DATASET_PATH" ]; then
  echo "[ERROR] 数据集目录不存在: $DATASET_PATH"
  exit 1
fi

#######################################
# Print the number of non-hidden entries directly inside a directory.
# Uses a glob rather than parsing `ls` output, so a name containing a
# newline is counted once. A missing directory is reported on stderr and
# counted as 0.
# Arguments: $1 - directory to inspect
# Outputs:   entry count on stdout
# Returns:   0
#######################################
count_entries() (
  # The body is a subshell, so the nullglob setting below cannot leak into
  # the rest of the script.
  dir=$1
  if [ ! -d "$dir" ]; then
    echo "[WARNING] 目录不存在: $dir" >&2
    echo 0
    exit 0
  fi
  # With nullglob an empty directory expands to zero words instead of the
  # literal pattern.
  shopt -s nullglob
  entries=("$dir"/*)
  echo "${#entries[@]}"
)

video_count=$(count_entries "$DATASET_PATH/videos")
metadata_count=$(count_entries "$DATASET_PATH/metadata")
echo " 视频数量: $video_count"
echo " 元数据数量: $metadata_count"
# A mismatch is only a warning; the script carries on.
if [ "$video_count" -ne "$metadata_count" ]; then
  echo "[WARNING] 视频和元数据数量不匹配"
fi
# Report the total size; `du -sh` prints "<size><TAB><path>", keep field 1.
dataset_size=$(du -sh "$DATASET_PATH" | cut -f1)
echo " 数据集大小: $dataset_size"
# Upload to HuggingFace. Offered only when HF_TOKEN is set; the upload runs
# only after two explicit "y" confirmations.
if [ -n "$HF_TOKEN" ]; then
  echo ""
  echo "========================================"
  echo "上传到 HuggingFace"
  echo "========================================"
  echo ""
  # `|| true`: a closed stdin (EOF) counts as "no" so the script still
  # reaches its final summary instead of dying at the prompt.
  read -r -p "是否上传到 HuggingFace? (y/n): " upload_choice || true
  if [ "$upload_choice" == "y" ]; then
    echo ""
    echo "请确认配置:"
    echo " - 编辑 upload_to_huggingface.py"
    echo " - 设置 HF_USERNAME (你的用户名)"
    echo " - 设置 DATASET_NAME (数据集名称)"
    echo ""
    read -r -p "配置已完成,继续上传? (y/n): " confirm_upload || true
    if [ "$confirm_upload" == "y" ]; then
      # Checked explicitly so a failed upload is reported and can never be
      # followed by the success message.
      if ! python3 upload_to_huggingface.py; then
        echo "[ERROR] 上传失败"
        exit 1
      fi
      echo ""
      echo " ✓ 上传完成"
    fi
  fi
else
  echo ""
  echo "[INFO] 跳过上传步骤(未设置 HF_TOKEN)"
fi
# Done: print the closing banner, the dataset location and the next steps.
printf '%s\n' \
  "" \
  "========================================" \
  "完成!" \
  "========================================" \
  "" \
  "数据集位置: $DATASET_PATH" \
  "" \
  "下一步:" \
  " 1. 查看数据集: ls $DATASET_PATH" \
  " 2. 查看 README: cat $DATASET_PATH/README.md"
# The third hint depends only on whether HF_TOKEN is set.
if [ -z "$HF_TOKEN" ]; then
  echo " 3. 设置 HF_TOKEN 并上传到 HuggingFace"
else
  echo " 3. 访问 HuggingFace 查看数据集"
fi
printf '%s\n' "" "详细指南: cat UPLOAD_GUIDE.md"
|