#!/usr/bin/env bash
#
# Download the ScienceQA dataset: parquet files via the Hugging Face CLI and
# the per-split image zip archives via wget.
#
# Usage:
#   bash download_scienceqa_hf.sh [parquet|images|all]
#
# Environment:
#   HF_ENDPOINT           Hub endpoint to use (takes priority when non-empty).
#   HF_ENDPOINT_OVERRIDE  Used when HF_ENDPOINT is unset or empty.
set -euo pipefail

# Requested mode; a missing or empty first argument means "all".
MODE="${1:-all}"

REPO_ID="derek-thomas/ScienceQA"
ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
HF_DIR="${ROOT_DIR}/hf"
IMG_DIR="${ROOT_DIR}/images"
CACHE_DIR="${ROOT_DIR}/.hf_cache"
DEFAULT_ENDPOINT="https://hf-mirror.com"
# Resolution order: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the default.
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# Drop every proxy variable (both spellings) so downloads go out directly.
unset http_proxy
unset https_proxy
unset HTTP_PROXY
unset HTTPS_PROXY
unset all_proxy
unset ALL_PROXY

export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

# Only touch the filesystem for real download modes, so that help requests
# and mistyped modes have no side effects and cannot fail on mkdir.
case "${MODE}" in
  parquet | images | all)
    mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"
    ;;
esac

# Command prefix used for Hub downloads. It stays empty unless the selected
# mode downloads parquet files, so help, unknown modes and "images" do not
# require the Hugging Face CLI to be installed.
HF_BIN=()
case "${MODE}" in
  parquet | all)
    if command -v hf >/dev/null 2>&1; then
      HF_BIN=(hf download)
    elif command -v huggingface-cli >/dev/null 2>&1; then
      HF_BIN=(huggingface-cli download)
    else
      echo "Missing Hugging Face CLI. Install it with:" >&2
      echo " python -m pip install -U \"huggingface_hub[cli]\"" >&2
      exit 1
    fi
    ;;
esac
|
|
#######################################
# Print usage, modes, output layout and endpoint notes.
# Globals:   HF_DIR, IMG_DIR, DEFAULT_ENDPOINT (read)
# Arguments: none
# Outputs:   help text on stdout
#######################################
print_help() {
  # The delimiter is deliberately unquoted: the directories and the default
  # endpoint are expanded from the script's configuration so the help text
  # cannot drift from the values actually used.
  cat <<EOF
Usage:
bash download_scienceqa_hf.sh [parquet|images|all]


Modes:
parquet Download the public Hugging Face parquet files only
images Download the original ScienceQA image zip files only
all Download both parquet files and images


Output layout:
${HF_DIR}
${IMG_DIR}


Notes:
- This dataset is public and should not require an HF token.
- Image URLs are adapted from:
/workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh
- Proxies are unset before download.
- Default HF endpoint: ${DEFAULT_ENDPOINT}
- To override and use the official endpoint:
HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet
EOF
}
|
|
# Any of the help spellings prints the usage text and ends the script
# successfully before a download mode is dispatched.
case "${MODE}" in
  -h | --help | help)
    print_help
    exit 0
    ;;
esac
|
|
#######################################
# Abort the script unless at least one path matches a glob pattern.
# Arguments: $1 - glob pattern to expand
# Outputs:   an error message on stderr when nothing matches
# Returns:   0 when a match exists; otherwise exits the shell with status 1
#######################################
verify_glob() {
  local glob_expr="$1"

  # compgen -G succeeds only when the pattern expands to an existing path.
  compgen -G "${glob_expr}" >/dev/null && return 0

  echo "Missing expected file matching: ${glob_expr}" >&2
  exit 1
}
|
|
#######################################
# Download the parquet files, README.md and ScienceQA.py of the dataset
# repository into HF_DIR, then check that every split is present.
# Globals:   HF_BIN, REPO_ID, CACHE_DIR, HF_DIR (read)
# Arguments: none
# Outputs:   an error message on stderr when a download fails
# Returns:   0 on success; exits the shell with status 1 on failure
#######################################
download_parquet() {
  local pattern
  local -a patterns=(
    "data/*.parquet"
    "README.md"
    "ScienceQA.py"
  )

  # One pattern per invocation. Hugging Face CLI releases disagree on how
  # several patterns are passed (listed after a single --include versus
  # repeating the flag), and the spelling one parser accepts can silently
  # drop patterns in the other. A single --include with a single value is
  # valid for both candidate binaries.
  for pattern in "${patterns[@]}"; do
    if ! "${HF_BIN[@]}" "${REPO_ID}" \
      --repo-type dataset \
      --cache-dir "${CACHE_DIR}" \
      --local-dir "${HF_DIR}" \
      --include "${pattern}"; then
      echo "Failed to download '${pattern}' from ${REPO_ID}" >&2
      exit 1
    fi
  done

  # The CLI can succeed without fetching anything, so verify the result.
  verify_glob "${HF_DIR}/data/train-*.parquet"
  verify_glob "${HF_DIR}/data/validation-*.parquet"
  verify_glob "${HF_DIR}/data/test-*.parquet"
}
|
|
#######################################
# Download and unpack the image archive of one split into IMG_DIR/<split>.
# Globals:   IMG_DIR (read)
# Arguments: $1 - split name, used for the zip name and the directory name
# Outputs:   a notice on stdout when the split already exists;
#            error messages on stderr
# Returns:   0 on success or when the split directory already exists;
#            exits the shell with status 1 on any failure
#######################################
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
  local staging_dir

  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi

  # -c resumes a partially downloaded archive left by an earlier run.
  if ! wget -c -O "${zip_path}" "${url}"; then
    echo "Failed to download image split: ${split} (${url})" >&2
    exit 1
  fi

  # Unpack into a scratch directory next to the destination. The final
  # directory only appears once extraction has fully succeeded, so an
  # interrupted run can never leave a partial split that the existence
  # check above would later accept as complete.
  if ! staging_dir="$(mktemp -d "${IMG_DIR}/.${split}.XXXXXX")"; then
    echo "Failed to create staging directory under: ${IMG_DIR}" >&2
    exit 1
  fi

  if ! unzip -q -o "${zip_path}" -d "${staging_dir}"; then
    rm -rf -- "${staging_dir}"
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi
  rm -f -- "${zip_path}"

  # The archive is expected to contain a top-level <split>/ directory;
  # anything else it contains is discarded with the staging directory.
  if [[ ! -d "${staging_dir}/${split}" ]]; then
    rm -rf -- "${staging_dir}"
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi

  if ! mv -- "${staging_dir}/${split}" "${split_dir}"; then
    rm -rf -- "${staging_dir}"
    echo "Failed to move image split into place: ${split_dir}" >&2
    exit 1
  fi
  rm -rf -- "${staging_dir}"
}
|
|
#######################################
# Fetch the image archives of the train, val and test splits, in that order.
# Arguments: none
# Returns:   status of the last download_one_split call
#######################################
download_images() {
  local split_name

  for split_name in train val test; do
    download_one_split "${split_name}"
  done
}
|
|
# Run the download(s) selected by MODE; any other value is reported on
# stderr together with the usage text and ends the script with status 1.
if [[ "${MODE}" == "parquet" ]]; then
  download_parquet
elif [[ "${MODE}" == "images" ]]; then
  download_images
elif [[ "${MODE}" == "all" ]]; then
  download_parquet
  download_images
else
  echo "Unknown mode: ${MODE}" >&2
  print_help >&2
  exit 1
fi

# Summary printed after the selected downloads have finished.
printf '%s\n' \
  "Download completed." \
  "Parquet dir: ${HF_DIR}" \
  "Image dir: ${IMG_DIR}"
|
|