ICL / RL_dataset /download_scienceqa_hf.sh
Lekr0's picture
Add files using upload-large-folder tool
90afcf2 verified
#!/usr/bin/env bash
# Download the ScienceQA dataset: parquet files through the Hugging Face CLI
# and the original image archives over HTTPS.
# Usage: bash download_scienceqa_hf.sh [parquet|images|all]
set -euo pipefail

# Download mode taken from the first argument; defaults to "all".
MODE="${1:-all}"

# Dataset repository on the Hugging Face Hub and the local output layout.
REPO_ID="derek-thomas/ScienceQA"
ROOT_DIR="/workspace/xiaobin/RL_dataset/data/ScienceQA"
HF_DIR="${ROOT_DIR}/hf"
IMG_DIR="${ROOT_DIR}/images"
CACHE_DIR="${ROOT_DIR}/.hf_cache"

# Endpoint precedence: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the mirror.
DEFAULT_ENDPOINT="https://hf-mirror.com"
HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

# All downloads run with every proxy variable removed from the environment.
unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY
export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

# Create the output directories only for modes that actually download, so that
# asking for help or mistyping the mode has no filesystem side effects and
# cannot fail on a machine where ROOT_DIR is not writable.
case "${MODE}" in
  parquet | images | all)
    mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"
    ;;
esac
#######################################
# Locate the Hugging Face CLI and record its download command in HF_BIN.
# `hf` is tried first, then `huggingface-cli`. A missing CLI is fatal only for
# the modes that download parquet files (parquet, all); for every other mode
# HF_BIN is left empty so help and image-only runs work without the CLI.
# Globals:   HF_BIN (written)
# Arguments: $1 - download mode
# Outputs:   installation hint on stderr when the CLI is required but missing
# Returns:   0 when a CLI was found or is not required; exits 1 otherwise
#######################################
ensure_hf_cli() {
  local mode="${1-}"
  HF_BIN=()
  if command -v hf >/dev/null 2>&1; then
    HF_BIN=(hf download)
  elif command -v huggingface-cli >/dev/null 2>&1; then
    HF_BIN=(huggingface-cli download)
  elif [[ "${mode}" == "parquet" || "${mode}" == "all" ]]; then
    echo "Missing Hugging Face CLI. Install it with:" >&2
    echo " python -m pip install -U \"huggingface_hub[cli]\"" >&2
    exit 1
  fi
}

# `${MODE-}` keeps this call safe under `set -u` even if MODE were unset.
ensure_hf_cli "${MODE-}"
# Write the usage text (modes, output layout and endpoint notes) to stdout.
# Takes no arguments; the text is static.
print_help() {
  local -a usage_lines=(
    'Usage:'
    'bash download_scienceqa_hf.sh [parquet|images|all]'
    'Modes:'
    'parquet Download the public Hugging Face parquet files only'
    'images Download the original ScienceQA image zip files only'
    'all Download both parquet files and images'
    'Output layout:'
    '/workspace/xiaobin/RL_dataset/data/ScienceQA/hf'
    '/workspace/xiaobin/RL_dataset/data/ScienceQA/images'
    'Notes:'
    '- This dataset is public and should not require an HF token.'
    '- Image URLs are adapted from:'
    '/workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh'
    '- Proxies are unset before download.'
    '- Default HF endpoint: https://hf-mirror.com'
    '- To override and use the official endpoint:'
    'HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet'
  )
  # One line per array element, each terminated by a newline.
  printf '%s\n' "${usage_lines[@]}"
}
# Print the usage text and stop successfully when help was requested.
case "${MODE}" in
  -h | --help | help)
    print_help
    exit 0
    ;;
esac
# Abort the script unless at least one existing path matches a glob pattern.
# Arguments: $1 - glob pattern to check
# Outputs:   error message on stderr when nothing matches
# Returns:   0 when a match exists; exits 1 otherwise
verify_glob() {
  local glob_expr="$1"
  # compgen -G succeeds only when the pattern expands to an existing path.
  if compgen -G "${glob_expr}" >/dev/null; then
    return 0
  fi
  echo "Missing expected file matching: ${glob_expr}" >&2
  exit 1
}
# Fetch the parquet files, README.md and ScienceQA.py from the dataset
# repository into HF_DIR, then check that each split produced a parquet file.
# Globals: HF_BIN, REPO_ID, CACHE_DIR, HF_DIR (all read)
# NOTE(review): whether repeated --include flags accumulate or the last one
# wins may depend on the installed CLI version - confirm against the CLI in use.
download_parquet() {
  local -a wanted=("data/*.parquet" "README.md" "ScienceQA.py")
  local -a cli_args=(
    "${REPO_ID}"
    --repo-type dataset
    --cache-dir "${CACHE_DIR}"
    --local-dir "${HF_DIR}"
  )
  local item split_name

  # One --include flag per requested file pattern, in the listed order.
  for item in "${wanted[@]}"; do
    cli_args+=(--include "${item}")
  done
  "${HF_BIN[@]}" "${cli_args[@]}"

  # verify_glob terminates the script when a split has no parquet file.
  for split_name in train validation test; do
    verify_glob "${HF_DIR}/data/${split_name}-*.parquet"
  done
}
#######################################
# Download and unpack one ScienceQA image split into IMG_DIR/<split>.
# The archive is unpacked into a private staging directory and renamed into
# place only after extraction succeeded, so an interrupted or failed run never
# leaves a half-extracted split that a later run would mistake for complete.
# Globals:   IMG_DIR (read)
# Arguments: $1 - split name, used for both the archive and directory name
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 when the split directory is present; exits 1 on any failure
#######################################
download_one_split() {
  local split="$1"
  local zip_path="${IMG_DIR}/${split}.zip"
  local split_dir="${IMG_DIR}/${split}"
  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
  local staging_dir

  if [[ -d "${split_dir}" ]]; then
    echo "Image split already exists: ${split_dir}"
    return 0
  fi

  # Drop staging directories left behind by a previously killed run.
  rm -rf -- "${IMG_DIR}/.${split}.extract."*

  # A partially downloaded archive is kept on failure so -c can resume it.
  if ! wget -c -O "${zip_path}" "${url}"; then
    echo "Failed to download image split: ${split}" >&2
    exit 1
  fi

  # Stage inside IMG_DIR so the final rename stays on one filesystem.
  if ! staging_dir="$(mktemp -d "${IMG_DIR}/.${split}.extract.XXXXXX")"; then
    echo "Failed to create staging directory in: ${IMG_DIR}" >&2
    exit 1
  fi

  if ! unzip -q -o "${zip_path}" -d "${staging_dir}"; then
    rm -rf -- "${staging_dir}"
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi

  # The archive is expected to contain a top-level directory named <split>.
  if [[ ! -d "${staging_dir}/${split}" ]]; then
    rm -rf -- "${staging_dir}"
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi

  if ! mv -- "${staging_dir}/${split}" "${split_dir}"; then
    rm -rf -- "${staging_dir}"
    echo "Failed to extract image split: ${split}" >&2
    exit 1
  fi

  rm -rf -- "${staging_dir}"
  rm -f -- "${zip_path}"
}
# Fetch every image split (train, val, test) in that order.
download_images() {
  local split_name
  for split_name in train val test; do
    download_one_split "${split_name}"
  done
}
# Run the downloads selected by MODE; any other value prints the usage text
# to stderr and fails.
if [[ "${MODE}" == "parquet" ]]; then
  download_parquet
elif [[ "${MODE}" == "images" ]]; then
  download_images
elif [[ "${MODE}" == "all" ]]; then
  download_parquet
  download_images
else
  echo "Unknown mode: ${MODE}" >&2
  print_help >&2
  exit 1
fi

# Summary of where the data was written.
printf '%s\n' \
  "Download completed." \
  "Parquet dir: ${HF_DIR}" \
  "Image dir: ${IMG_DIR}"