File size: 3,219 Bytes
90afcf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env bash
set -euo pipefail

# Download mode requested on the command line; defaults to fetching everything.
MODE="${1:-all}"

# Dataset coordinates and the on-disk layout every download function writes to.
REPO_ID="derek-thomas/ScienceQA"
ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
HF_DIR="${ROOT_DIR}/hf"
IMG_DIR="${ROOT_DIR}/images"
CACHE_DIR="${ROOT_DIR}/.hf_cache"
DEFAULT_ENDPOINT="https://hf-mirror.com"
# Endpoint precedence: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the default.
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Drop every proxy variable inherited from the environment before downloading.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

# Only touch the filesystem when a real download mode was requested. Help and
# mistyped modes must not create directories, nor abort under `set -e` because
# the output root is not writable.
case "${MODE}" in
  parquet | images | all)
    mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"
    ;;
esac

# Locate a Hugging Face CLI and store its download command as an array.
# `hf` is preferred; `huggingface-cli` is the fallback.
# The CLI is only used for the parquet download, so a missing CLI is fatal only
# for the modes that need it. `images`, help and unknown modes continue with an
# empty HF_BIN and are handled further down.
HF_BIN=()
if command -v hf >/dev/null 2>&1; then
  HF_BIN=(hf download)
elif command -v huggingface-cli >/dev/null 2>&1; then
  HF_BIN=(huggingface-cli download)
elif [[ "${MODE}" == "parquet" || "${MODE}" == "all" ]]; then
  echo "Missing Hugging Face CLI. Install it with:" >&2
  echo "  python -m pip install -U \"huggingface_hub[cli]\"" >&2
  exit 1
fi

#######################################
# Print usage, available modes, output layout and endpoint notes.
# Globals:   HF_DIR, IMG_DIR, DEFAULT_ENDPOINT (read)
# Arguments: none
# Outputs:   help text on stdout
#######################################
print_help() {
  # The heredoc delimiter is unquoted on purpose: the output paths and default
  # endpoint are expanded from the configuration variables so the help text
  # cannot drift from the values the script actually uses.
  cat <<EOF
Usage:
  bash download_scienceqa_hf.sh [parquet|images|all]

Modes:
  parquet  Download the public Hugging Face parquet files only
  images   Download the original ScienceQA image zip files only
  all      Download both parquet files and images

Output layout:
  ${HF_DIR}
  ${IMG_DIR}

Notes:
  - This dataset is public and should not require an HF token.
  - Image URLs are adapted from:
    /workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh
  - Proxies are unset before download.
  - Default HF endpoint: ${DEFAULT_ENDPOINT}
  - To override and use the official endpoint:
    HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet
EOF
}

# Any of the accepted help spellings prints the usage text and ends the run
# successfully before a download is attempted.
case "${MODE}" in
  -h | --help | help)
    print_help
    exit 0
    ;;
esac

#######################################
# Abort the script unless at least one path matches the given glob.
# Arguments: $1 - glob pattern to expand
# Outputs:   an error line on stderr when nothing matches
# Returns:   0 when a match exists; otherwise exits the shell with status 1
#######################################
verify_glob() {
  local glob_expr="$1"

  # compgen -G succeeds only when the pattern expands to an existing path.
  compgen -G "${glob_expr}" >/dev/null && return 0

  echo "Missing expected file matching: ${glob_expr}" >&2
  exit 1
}

#######################################
# Download the parquet shards plus README.md and ScienceQA.py from the dataset
# repository, then check that every split produced at least one shard.
# Globals:   HF_BIN, REPO_ID, CACHE_DIR, HF_DIR (read)
# Arguments: none
# Returns:   0 on success; exits the shell (via verify_glob) when a split's
#            parquet shard is missing
#######################################
download_parquet() {
  local -a patterns=("data/*.parquet" "README.md" "ScienceQA.py")
  local pattern

  # One invocation per pattern. Repeating --include inside a single command is
  # not portable across CLI releases: argparse-based versions keep only the
  # last occurrence, which would silently skip the parquet files.
  for pattern in "${patterns[@]}"; do
    "${HF_BIN[@]}" "${REPO_ID}" \
      --repo-type dataset \
      --cache-dir "${CACHE_DIR}" \
      --local-dir "${HF_DIR}" \
      --include "${pattern}"
  done

  verify_glob "${HF_DIR}/data/train-*.parquet"
  verify_glob "${HF_DIR}/data/validation-*.parquet"
  verify_glob "${HF_DIR}/data/test-*.parquet"
}

#######################################
# Download and unpack the image archive of one split.
# Globals:   IMG_DIR (read)
# Arguments: $1 - split name used in the archive URL and directory name
# Outputs:   a notice on stdout when the split is already present;
#            diagnostics on stderr on failure
# Returns:   0 on success or when the split already exists; exits the shell
#            with status 1 on a missing tool, download or extraction failure
#######################################
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  # Staging lives inside IMG_DIR so the final move stays within one directory
  # tree instead of copying across locations.
  local staging_dir="${IMG_DIR}/.${split}.extracting"
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
  local tool

  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi

  for tool in wget unzip; do
    if ! command -v "${tool}" >/dev/null 2>&1; then
      echo "Missing required tool: ${tool}" >&2
      exit 1
    fi
  done

  # Start from a clean staging area; a previous run may have been interrupted.
  rm -rf -- "${staging_dir}"
  mkdir -p -- "${staging_dir}"

  # The partial archive is kept on failure so that `wget -c` can resume it.
  if ! wget -c -O "${zip_path}" "${url}"; then
    echo "Failed to download image split: ${split}" >&2
    rm -rf -- "${staging_dir}"
    exit 1
  fi

  # Extract into staging, never straight into ${split_dir}: a failed or
  # interrupted unzip must not leave a partial directory that the
  # "already exists" check above would accept on the next run.
  if ! unzip -q -o "${zip_path}" -d "${staging_dir}"; then
    echo "Failed to extract image split: ${split}" >&2
    rm -rf -- "${staging_dir}"
    exit 1
  fi

  # Only the top-level ${split}/ directory of the archive is kept.
  # NOTE(review): assumes the archive holds nothing else of value at its top
  # level - confirm against the published zips.
  if [[ ! -d "${staging_dir}/${split}" ]]; then
    echo "Failed to extract image split: ${split}" >&2
    rm -rf -- "${staging_dir}"
    exit 1
  fi

  mv -- "${staging_dir}/${split}" "${split_dir}"
  rm -rf -- "${staging_dir}"
  rm -f -- "${zip_path}"
}

#######################################
# Fetch the image archives for the train, val and test splits, in that order.
# Arguments: none
#######################################
download_images() {
  local split_name

  for split_name in train val test; do
    download_one_split "${split_name}"
  done
}

# Run the download(s) selected by MODE. An unrecognised mode prints the error
# and the usage text on stderr and exits with status 1.
if [[ "${MODE}" == "parquet" ]]; then
  download_parquet
elif [[ "${MODE}" == "images" ]]; then
  download_images
elif [[ "${MODE}" == "all" ]]; then
  download_parquet
  download_images
else
  echo "Unknown mode: ${MODE}" >&2
  print_help >&2
  exit 1
fi

# Summary of where the downloaded data lives.
printf '%s\n' \
  "Download completed." \
  "Parquet dir: ${HF_DIR}" \
  "Image dir:   ${IMG_DIR}"