#!/bin/bash
#
# Interactive helper: checks the environment, prepares a HuggingFace dataset,
# verifies the result and optionally uploads it.

# Stop at the first command that fails.
set -e

# Opening banner.
printf '%s\n' \
  "========================================" \
  "HuggingFace 数据集准备和上传" \
  "========================================"

# ---------------------------------------------------------------------------
# Step 1: make sure the required tools are available.
# ---------------------------------------------------------------------------
echo ""
echo "步骤 1: 检查环境..."

if ! command -v python3 > /dev/null 2>&1; then
  echo "[ERROR] Python3 未安装" >&2
  exit 1
fi
echo " ✓ Python3"

# Probe pip through python3 itself instead of a bare `pip` on PATH: a
# standalone `pip` may belong to a different interpreter, in which case the
# packages installed below would not be importable by the python3 scripts.
if ! python3 -m pip --version > /dev/null 2>&1; then
  echo "[ERROR] pip 未安装" >&2
  exit 1
fi
echo " ✓ pip"

# ---------------------------------------------------------------------------
# Step 2: install the Python packages used by the prepare/upload scripts.
# ---------------------------------------------------------------------------
echo ""
echo "步骤 2: 安装依赖..."
# Tested directly so a failed install produces a message instead of a silent
# abort.
if ! python3 -m pip install -q huggingface_hub datasets tqdm; then
  echo "[ERROR] 依赖安装失败" >&2
  exit 1
fi
echo " ✓ 依赖已安装"

# ---------------------------------------------------------------------------
# Step 3: check for a HuggingFace token. Without one the user may still
# continue, in which case only the dataset preparation is useful.
# ---------------------------------------------------------------------------
echo ""
echo "步骤 3: 检查 HuggingFace Token..."
if [ -n "$HF_TOKEN" ]; then
  echo " ✓ HF_TOKEN 已设置"
else
  # Explain how to obtain a token before asking whether to go on.
  printf '%s\n' \
    "[WARNING] HF_TOKEN 环境变量未设置" \
    "" \
    "请设置你的 HuggingFace Token:" \
    " 1. 访问 https://huggingface.co/settings/tokens" \
    " 2. 创建一个 Write 权限的 token" \
    " 3. 运行: export HF_TOKEN='your_token_here'" \
    ""
  read -p "是否继续(仅准备数据集,不上传)? (y/n): " continue_without_token
  # Only an exact lowercase "y" continues; anything else stops the script.
  case "$continue_without_token" in
    y) ;;
    *) exit 1 ;;
  esac
fi

# ---------------------------------------------------------------------------
# Prepare the dataset by running prepare_hf_dataset.py.
# ---------------------------------------------------------------------------
echo ""
echo "========================================"
echo "准备数据集"
echo "========================================"
echo ""

# Ask for the sample count. -r keeps backslashes literal; `|| true` lets an
# empty/closed stdin fall through to the advertised default instead of
# aborting the script under `set -e`.
sample_count=""
read -r -p "采样数量 (默认: 2500): " sample_count || true
sample_count=${sample_count:-2500}

# Reject anything that is not a positive integer before doing any work.
if ! [[ "$sample_count" =~ ^[1-9][0-9]*$ ]]; then
  echo "[ERROR] 采样数量必须是正整数: $sample_count" >&2
  exit 1
fi

# NOTE(review): prepare_hf_dataset.py is invoked without arguments, so the
# value entered above only affects the message below unless the script picks
# it up some other way. It is exported as SAMPLE_COUNT so the script can read
# it from the environment — confirm against prepare_hf_dataset.py.
export SAMPLE_COUNT="$sample_count"

echo ""
echo "开始准备数据集 (采样 $sample_count 个视频)..."

# Test the command directly: with `set -e` active, a separate `$?` check
# placed after the command is never reached when the command fails, so the
# error message would never be printed.
if ! python3 prepare_hf_dataset.py; then
  echo "[ERROR] 数据集准备失败" >&2
  exit 1
fi

echo ""
echo " ✓ 数据集准备完成"

# ---------------------------------------------------------------------------
# Verify the prepared dataset: directory layout, entry counts, total size.
# ---------------------------------------------------------------------------
echo ""
echo "========================================"
echo "验证数据集"
echo "========================================"

DATASET_PATH="/playpen-ssd/dataset/droid_raw/hg_data"

#######################################
# Print the number of non-hidden entries directly inside a directory.
# Runs in a subshell so the nullglob setting does not leak to the caller.
# Arguments: $1 - directory to inspect
# Outputs:   the entry count on stdout
#######################################
count_entries() (
  shopt -s nullglob
  entries=("$1"/*)
  printf '%d\n' "${#entries[@]}"
)

# All three directories must exist; otherwise both counts would silently be
# reported as 0 and the mismatch warning below would never fire.
for required_dir in \
  "$DATASET_PATH" \
  "$DATASET_PATH/videos" \
  "$DATASET_PATH/metadata"; do
  if [[ ! -d "$required_dir" ]]; then
    echo "[ERROR] 数据集目录不存在: $required_dir" >&2
    exit 1
  fi
done

# Count with a glob rather than parsing `ls` output, which miscounts names
# containing newlines and hides a failing `ls` behind the pipe.
video_count=$(count_entries "$DATASET_PATH/videos")
metadata_count=$(count_entries "$DATASET_PATH/metadata")

echo " 视频数量: $video_count"
echo " 元数据数量: $metadata_count"

if (( video_count != metadata_count )); then
  echo "[WARNING] 视频和元数据数量不匹配"
fi

# Human-readable total size; du separates size and path with a tab.
dataset_size=$(du -sh "$DATASET_PATH" | cut -f1)
echo " 数据集大小: $dataset_size"

# ---------------------------------------------------------------------------
# Optional upload via upload_to_huggingface.py. Offered only when HF_TOKEN is
# set, and only run after two explicit "y" confirmations.
# ---------------------------------------------------------------------------
if [[ -n "${HF_TOKEN:-}" ]]; then
  echo ""
  echo "========================================"
  echo "上传到 HuggingFace"
  echo "========================================"
  echo ""
  # -r keeps backslashes literal; `|| true` makes an empty/closed stdin count
  # as "no" instead of aborting the whole script under `set -e`.
  upload_choice=""
  read -r -p "是否上传到 HuggingFace? (y/n): " upload_choice || true

  if [[ "$upload_choice" == "y" ]]; then
    echo ""
    echo "请确认配置:"
    echo " - 编辑 upload_to_huggingface.py"
    echo " - 设置 HF_USERNAME (你的用户名)"
    echo " - 设置 DATASET_NAME (数据集名称)"
    echo ""
    confirm_upload=""
    read -r -p "配置已完成,继续上传? (y/n): " confirm_upload || true

    if [[ "$confirm_upload" == "y" ]]; then
      # Capture the status explicitly so a failed upload is reported and its
      # exit code is passed on, and the success line is printed only when
      # the upload really succeeded.
      upload_status=0
      python3 upload_to_huggingface.py || upload_status=$?
      if (( upload_status != 0 )); then
        echo "[ERROR] 上传失败" >&2
        exit "$upload_status"
      fi
      echo ""
      echo " ✓ 上传完成"
    fi
  fi
else
  echo ""
  echo "[INFO] 跳过上传步骤(未设置 HF_TOKEN)"
fi

# ---------------------------------------------------------------------------
# Closing summary: where the dataset lives and what to do next.
# ---------------------------------------------------------------------------
# The third hint depends only on whether HF_TOKEN is set.
# NOTE(review): the "visit HuggingFace" hint is shown whenever HF_TOKEN is
# set, regardless of whether the upload was actually performed.
if [[ -n "$HF_TOKEN" ]]; then
  third_step=" 3. 访问 HuggingFace 查看数据集"
else
  third_step=" 3. 设置 HF_TOKEN 并上传到 HuggingFace"
fi

printf '%s\n' \
  "" \
  "========================================" \
  "完成!" \
  "========================================" \
  "" \
  "数据集位置: $DATASET_PATH" \
  "" \
  "下一步:" \
  " 1. 查看数据集: ls $DATASET_PATH" \
  " 2. 查看 README: cat $DATASET_PATH/README.md" \
  "$third_step" \
  "" \
  "详细指南: cat UPLOAD_GUIDE.md"
|