errm / tools /quick_start.sh
yuffish's picture
Add files using upload-large-folder tool
a741a7c verified
#!/bin/bash
# Quick-start script: prepare a dataset and upload it to HuggingFace.
set -e  # stop at the first command that fails

# Opening banner.
printf '%s\n' \
  "========================================" \
  "HuggingFace 数据集准备和上传" \
  "========================================"
# Step 1: verify the required tools are present.
echo ""
echo "步骤 1: 检查环境..."
if ! command -v python3 &> /dev/null; then
  echo "[ERROR] Python3 未安装" >&2
  exit 1
fi
echo " ✓ Python3"
# Probe pip as a module of python3 rather than as a bare `pip` command:
# a `pip` found on PATH may belong to a different interpreter than the
# python3 this script invokes later.
if ! python3 -m pip --version &> /dev/null; then
  echo "[ERROR] pip 未安装" >&2
  exit 1
fi
echo " ✓ pip"

# Step 2: install the Python dependencies into that same python3.
echo ""
echo "步骤 2: 安装依赖..."
# Test the command directly so a failed install reports an error instead of
# ending the script without any message of its own.
if ! python3 -m pip install -q huggingface_hub datasets tqdm; then
  echo "[ERROR] 依赖安装失败" >&2
  exit 1
fi
echo " ✓ 依赖已安装"
# Step 3: check whether a HuggingFace token is available in the environment.
echo ""
echo "步骤 3: 检查 HuggingFace Token..."
if [[ -n "${HF_TOKEN:-}" ]]; then
  echo " ✓ HF_TOKEN 已设置"
else
  echo "[WARNING] HF_TOKEN 环境变量未设置"
  echo ""
  echo "请设置你的 HuggingFace Token:"
  echo " 1. 访问 https://huggingface.co/settings/tokens"
  echo " 2. 创建一个 Write 权限的 token"
  echo " 3. 运行: export HF_TOKEN='your_token_here'"
  echo ""
  # -r keeps backslashes in the answer literal.
  # `|| true`: read returns non-zero at end-of-input (e.g. piped answer with
  # no trailing newline); without it `set -e` would abort before the answer
  # is examined.
  continue_without_token=""
  read -r -p "是否继续(仅准备数据集,不上传)? (y/n): " continue_without_token || true
  # Anything other than a literal "y" (including no answer) stops the script.
  if [[ "$continue_without_token" != "y" ]]; then
    exit 1
  fi
fi
# Dataset preparation: ask for a sample size, then run the preparation script.
echo ""
echo "========================================"
echo "准备数据集"
echo "========================================"
echo ""
# -r keeps backslashes literal. `|| true`: at end-of-input read returns
# non-zero, which under `set -e` would abort the script before the default
# below could take effect.
sample_count=""
read -r -p "采样数量 (默认: 2500): " sample_count || true
sample_count=${sample_count:-2500}
echo ""
echo "开始准备数据集 (采样 $sample_count 个视频)..."
# NOTE(review): sample_count is only echoed above; nothing here passes it to
# prepare_hf_dataset.py. Confirm how that script takes its sample size and
# forward the value accordingly.
#
# Test the command itself: with `set -e` active, a separate `$?` check on the
# following line is never reached on failure, so the error message would
# never be printed.
if ! python3 prepare_hf_dataset.py; then
  echo "[ERROR] 数据集准备失败" >&2
  exit 1
fi
echo ""
echo " ✓ 数据集准备完成"
# Verification: confirm the dataset directory exists and report its contents.
echo ""
echo "========================================"
echo "验证数据集"
echo "========================================"
DATASET_PATH="/playpen-ssd/dataset/droid_raw/hg_data"
if [[ ! -d "$DATASET_PATH" ]]; then
  echo "[ERROR] 数据集目录不存在: $DATASET_PATH" >&2
  exit 1
fi

# Print the number of non-hidden entries directly inside directory $1.
# Counts via a glob instead of parsing `ls` output, so a name containing a
# newline is counted once. A missing or empty directory prints 0.
# The body is a subshell so the nullglob setting does not leak to the caller.
count_entries() (
  shopt -s nullglob
  entries=("$1"/*)
  echo "${#entries[@]}"
)

video_count=$(count_entries "$DATASET_PATH/videos")
metadata_count=$(count_entries "$DATASET_PATH/metadata")
echo " 视频数量: $video_count"
echo " 元数据数量: $metadata_count"
# A mismatch is reported but does not stop the script.
if (( video_count != metadata_count )); then
  echo "[WARNING] 视频和元数据数量不匹配"
fi

# Total on-disk size: first tab-separated field of `du -sh`.
dataset_size=$(du -sh "$DATASET_PATH" | cut -f1)
echo " 数据集大小: $dataset_size"
# Upload: only offered when HF_TOKEN is non-empty; otherwise the step is
# skipped with an informational message.
if [[ -n "${HF_TOKEN:-}" ]]; then
  echo ""
  echo "========================================"
  echo "上传到 HuggingFace"
  echo "========================================"
  echo ""
  # -r keeps backslashes literal. `|| true`: at end-of-input read returns
  # non-zero; without it `set -e` would kill the script here instead of
  # simply skipping the upload.
  upload_choice=""
  read -r -p "是否上传到 HuggingFace? (y/n): " upload_choice || true
  if [[ "$upload_choice" == "y" ]]; then
    echo ""
    echo "请确认配置:"
    echo " - 编辑 upload_to_huggingface.py"
    echo " - 设置 HF_USERNAME (你的用户名)"
    echo " - 设置 DATASET_NAME (数据集名称)"
    echo ""
    confirm_upload=""
    read -r -p "配置已完成,继续上传? (y/n): " confirm_upload || true
    if [[ "$confirm_upload" == "y" ]]; then
      # Test the command directly so a failed upload is reported explicitly
      # rather than ending the script with no message of its own.
      if ! python3 upload_to_huggingface.py; then
        echo "[ERROR] 上传失败" >&2
        exit 1
      fi
      echo ""
      echo " ✓ 上传完成"
    fi
  fi
else
  echo ""
  echo "[INFO] 跳过上传步骤(未设置 HF_TOKEN)"
fi
# Closing summary: show where the dataset is and list the next steps.
banner_rule="========================================"

# The third hint depends on whether HF_TOKEN is non-empty.
if [[ -n "$HF_TOKEN" ]]; then
  final_hint=" 3. 访问 HuggingFace 查看数据集"
else
  final_hint=" 3. 设置 HF_TOKEN 并上传到 HuggingFace"
fi

# One printf call emits every line; the format is reused per argument.
printf '%s\n' \
  "" \
  "$banner_rule" \
  "完成!" \
  "$banner_rule" \
  "" \
  "数据集位置: $DATASET_PATH" \
  "" \
  "下一步:" \
  " 1. 查看数据集: ls $DATASET_PATH" \
  " 2. 查看 README: cat $DATASET_PATH/README.md" \
  "$final_hint" \
  "" \
  "详细指南: cat UPLOAD_GUIDE.md"