File size: 3,816 Bytes
a741a7c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/bin/bash
# Quick-start script: prepare a dataset and upload it to HuggingFace.

# Abort as soon as any unguarded command returns a non-zero status.
set -o errexit

# Opening banner.
printf '%s\n' \
    "========================================" \
    "HuggingFace 数据集准备和上传" \
    "========================================"

# Step 1: make sure the required tools are available.
echo ""
echo "步骤 1: 检查环境..."

if ! command -v python3 > /dev/null 2>&1; then
    echo "[ERROR] Python3 未安装" >&2
    exit 1
fi
echo "  ✓ Python3"

# Probe pip through python3 itself rather than a bare `pip` on PATH: a bare
# `pip` may be absent (only `pip3` installed) or may belong to a different
# interpreter than the python3 that runs the project scripts later on.
if ! python3 -m pip --version > /dev/null 2>&1; then
    echo "[ERROR] pip 未安装" >&2
    exit 1
fi
echo "  ✓ pip"

# Step 2: install the Python dependencies into that same interpreter.
echo ""
echo "步骤 2: 安装依赖..."
# Test the command directly so a failed install is reported explicitly
# instead of relying on errexit to stop the script without a message.
if ! python3 -m pip install -q huggingface_hub datasets tqdm; then
    echo "[ERROR] 依赖安装失败" >&2
    exit 1
fi
echo "  ✓ 依赖已安装"

# Step 3: check whether the HF_TOKEN environment variable is set.
# Without it the user may still continue; the upload section further down
# checks HF_TOKEN again on its own.
echo ""
echo "步骤 3: 检查 HuggingFace Token..."
if [[ -n "${HF_TOKEN:-}" ]]; then
    echo "  ✓ HF_TOKEN 已设置"
else
    echo "[WARNING] HF_TOKEN 环境变量未设置"
    echo ""
    echo "请设置你的 HuggingFace Token:"
    echo "  1. 访问 https://huggingface.co/settings/tokens"
    echo "  2. 创建一个 Write 权限的 token"
    echo "  3. 运行: export HF_TOKEN='your_token_here'"
    echo ""
    continue_without_token=""
    # -r keeps backslashes literal. `|| true` is intentional: on EOF (for
    # example a non-interactive run) `read` returns non-zero, and the answer
    # is then handled by the check below instead of tripping errexit.
    read -r -p "是否继续(仅准备数据集,不上传)? (y/n): " continue_without_token || true
    # Only a literal lowercase "y" continues; anything else stops the script.
    if [[ "$continue_without_token" != "y" ]]; then
        exit 1
    fi
fi

# Prepare the dataset by running prepare_hf_dataset.py from the current
# working directory.
echo ""
echo "========================================"
echo "准备数据集"
echo "========================================"
echo ""
sample_count=""
# -r keeps backslashes literal. `|| true` is intentional: on EOF the empty
# answer falls back to the default below instead of tripping errexit.
read -r -p "采样数量 (默认: 2500): " sample_count || true
sample_count=${sample_count:-2500}

echo ""
echo "开始准备数据集 (采样 $sample_count 个视频)..."
# NOTE(review): sample_count is only echoed above; it is not passed to
# prepare_hf_dataset.py in any form. Confirm how that script expects to
# receive the sample size and wire it through, or remove the prompt.
#
# Test the command directly: with errexit active, a separate `$?` check on
# the following line would never run after a failure, so the error message
# would never be shown.
if ! python3 prepare_hf_dataset.py; then
    echo "[ERROR] 数据集准备失败" >&2
    exit 1
fi

echo ""
echo "  ✓ 数据集准备完成"

# Verify the prepared dataset: the directory must exist, and the number of
# entries under videos/ and metadata/ is reported and compared.
echo ""
echo "========================================"
echo "验证数据集"
echo "========================================"

DATASET_PATH="/playpen-ssd/dataset/droid_raw/hg_data"

if [[ ! -d "$DATASET_PATH" ]]; then
    echo "[ERROR] 数据集目录不存在: $DATASET_PATH" >&2
    exit 1
fi

#######################################
# Print the number of non-hidden entries directly inside a directory.
# Uses a glob instead of parsing `ls`, so names containing newlines are
# counted once. The body is a subshell, so nullglob does not leak out.
# Arguments: $1 - directory to inspect
# Outputs:   the entry count on stdout (0 when the directory is empty
#            or does not exist)
#######################################
count_entries() (
    shopt -s nullglob
    entries=("$1"/*)
    printf '%s\n' "${#entries[@]}"
)

# Call out a missing subdirectory explicitly; otherwise it would only show
# up as a count of 0.
for dataset_subdir in videos metadata; do
    if [[ ! -d "$DATASET_PATH/$dataset_subdir" ]]; then
        echo "[WARNING] 目录不存在: $DATASET_PATH/$dataset_subdir"
    fi
done

video_count=$(count_entries "$DATASET_PATH/videos")
metadata_count=$(count_entries "$DATASET_PATH/metadata")

echo "  视频数量: $video_count"
echo "  元数据数量: $metadata_count"

# A mismatch is reported but does not stop the script.
if (( video_count != metadata_count )); then
    echo "[WARNING] 视频和元数据数量不匹配"
fi

# Total size on disk, human-readable (first tab-separated field of `du -sh`).
dataset_size=$(du -sh "$DATASET_PATH" | cut -f1)
echo "  数据集大小: $dataset_size"

# Upload to HuggingFace. Only offered when HF_TOKEN is set, and only run
# after two explicit "y" confirmations from the user.
if [[ -n "${HF_TOKEN:-}" ]]; then
    echo ""
    echo "========================================"
    echo "上传到 HuggingFace"
    echo "========================================"
    echo ""
    upload_choice=""
    # -r keeps backslashes literal. `|| true` is intentional: EOF on stdin is
    # treated like any non-"y" answer (skip the upload) instead of tripping
    # errexit.
    read -r -p "是否上传到 HuggingFace? (y/n): " upload_choice || true

    if [[ "$upload_choice" == "y" ]]; then
        echo ""
        echo "请确认配置:"
        echo "  - 编辑 upload_to_huggingface.py"
        echo "  - 设置 HF_USERNAME (你的用户名)"
        echo "  - 设置 DATASET_NAME (数据集名称)"
        echo ""
        confirm_upload=""
        read -r -p "配置已完成,继续上传? (y/n): " confirm_upload || true

        if [[ "$confirm_upload" == "y" ]]; then
            # Test the command directly so a failed upload is reported
            # explicitly, in the same way as a failed preparation step.
            if ! python3 upload_to_huggingface.py; then
                echo "[ERROR] 上传失败" >&2
                exit 1
            fi
            echo ""
            echo "  ✓ 上传完成"
        fi
    fi
else
    echo ""
    echo "[INFO] 跳过上传步骤(未设置 HF_TOKEN)"
fi

# Closing summary: where the dataset lives and what to do next.
# The third hint depends on whether HF_TOKEN is set.
if [ -z "$HF_TOKEN" ]; then
    third_hint="  3. 设置 HF_TOKEN 并上传到 HuggingFace"
else
    third_hint="  3. 访问 HuggingFace 查看数据集"
fi

printf '%s\n' \
    "" \
    "========================================" \
    "完成!" \
    "========================================" \
    "" \
    "数据集位置: $DATASET_PATH" \
    "" \
    "下一步:" \
    "  1. 查看数据集: ls $DATASET_PATH" \
    "  2. 查看 README: cat $DATASET_PATH/README.md" \
    "$third_hint" \
    "" \
    "详细指南: cat UPLOAD_GUIDE.md"