InContextLearning
/

ICL

Model card Files Files and versions

ICL / RL_dataset /download_scienceqa_hf.sh

Lekr0's picture

Add files using upload-large-folder tool

90afcf2 verified about 1 month ago

history blame contribute delete

3.22 kB

	#!/usr/bin/env bash
	#
	# Download the ScienceQA dataset: the parquet files published on the
	# Hugging Face Hub and/or the original image zip archives.
	#
	# Usage:
	#   bash download_scienceqa_hf.sh [parquet|images|all]
	#
	# Environment:
	#   SCIENCEQA_ROOT_DIR    Output root directory
	#                         (default: /workspace/xiaobin/RL_dataset/data/ScienceQA).
	#   HF_ENDPOINT           Hub endpoint; takes priority when set and non-empty.
	#   HF_ENDPOINT_OVERRIDE  Hub endpoint used only when HF_ENDPOINT is unset/empty.
	set -euo pipefail

	# Download mode; defaults to "all" when no argument is given.
	MODE="${1:-all}"

	REPO_ID="derek-thomas/ScienceQA"
	# The historical location stays the default so existing setups keep working;
	# SCIENCEQA_ROOT_DIR lets other machines pick their own output root.
	ROOT_DIR="${SCIENCEQA_ROOT_DIR:-/workspace/xiaobin/RL_dataset/data/ScienceQA}"
	HF_DIR="${ROOT_DIR}/hf"
	IMG_DIR="${ROOT_DIR}/images"
	CACHE_DIR="${ROOT_DIR}/.hf_cache"
	DEFAULT_ENDPOINT="https://hf-mirror.com"
	# Precedence: HF_ENDPOINT, then HF_ENDPOINT_OVERRIDE, then the mirror default.
	HF_ENDPOINT_VALUE="${HF_ENDPOINT:-${HF_ENDPOINT_OVERRIDE:-${DEFAULT_ENDPOINT}}}"

	# Downloads are made without any proxy, whatever the caller's environment.
	unset http_proxy https_proxy HTTP_PROXY HTTPS_PROXY all_proxy ALL_PROXY

	export HF_ENDPOINT="${HF_ENDPOINT_VALUE}"

	# Create the output directories only for real download modes, so that
	# asking for help (or mistyping a mode) never touches the filesystem.
	case "${MODE:-}" in
	  parquet | images | all)
	    mkdir -p "${HF_DIR}" "${IMG_DIR}" "${CACHE_DIR}"
	    ;;
	esac

	# Locate the Hugging Face CLI: prefer `hf`, fall back to `huggingface-cli`.
	# HF_BIN holds the command plus its `download` subcommand as an array.
	# The CLI is only mandatory for modes that fetch parquet files; the image
	# download and the help text do not use it.
	HF_BIN=()
	if command -v hf >/dev/null 2>&1; then
	  HF_BIN=(hf download)
	elif command -v huggingface-cli >/dev/null 2>&1; then
	  HF_BIN=(huggingface-cli download)
	elif [[ "${MODE:-}" == "parquet" || "${MODE:-}" == "all" ]]; then
	  echo "Missing Hugging Face CLI. Install it with:" >&2
	  echo " python -m pip install -U \"huggingface_hub[cli]\"" >&2
	  exit 1
	fi

	#######################################
	# Print the usage text.
	# Globals:   HF_DIR, IMG_DIR, DEFAULT_ENDPOINT (read)
	# Arguments: none
	# Outputs:   writes the help text to stdout
	#######################################
	print_help() {
	  # One printf argument per output line: unlike a here-document, this does
	  # not depend on the column of a terminator line.
	  printf '%s\n' \
	    'Usage:' \
	    '  bash download_scienceqa_hf.sh [parquet|images|all]' \
	    '' \
	    'Modes:' \
	    '  parquet  Download the public Hugging Face parquet files only' \
	    '  images   Download the original ScienceQA image zip files only' \
	    '  all      Download both parquet files and images' \
	    '' \
	    'Output layout:' \
	    "  ${HF_DIR}" \
	    "  ${IMG_DIR}" \
	    '' \
	    'Notes:' \
	    '  - This dataset is public and should not require an HF token.' \
	    '  - Image URLs are adapted from:' \
	    '    /workspace/xiaobin/RL_dataset/ScienceQA/tools/download.sh' \
	    '  - Proxies are unset before download.' \
	    "  - Default HF endpoint: ${DEFAULT_ENDPOINT}" \
	    '  - To override and use the official endpoint:' \
	    '    HF_ENDPOINT=https://huggingface.co bash download_scienceqa_hf.sh parquet'
	}

	# Show the usage text and stop when help was requested in any accepted form.
	case "${MODE}" in
	  -h | --help | help)
	    print_help
	    exit 0
	    ;;
	esac

	#######################################
	# Abort the script unless at least one path matches a glob pattern.
	# Arguments: $1 - glob pattern to check
	# Outputs:   an error message on stderr when nothing matches
	# Returns:   0 when a match exists; otherwise exits with status 1
	#######################################
	verify_glob() {
	  local glob_expr="$1"

	  # compgen -G succeeds only when the pattern expands to an existing path.
	  compgen -G "${glob_expr}" >/dev/null && return 0

	  echo "Missing expected file matching: ${glob_expr}" >&2
	  exit 1
	}

	#######################################
	# Download data/*.parquet, README.md and ScienceQA.py of the dataset repo
	# into HF_DIR, then require at least one parquet shard per split.
	# Globals:   HF_BIN, REPO_ID, CACHE_DIR, HF_DIR (read)
	# Arguments: none
	# Returns:   exits via verify_glob when a split has no parquet file
	#######################################
	download_parquet() {
	  # NOTE(review): how a repeated --include is merged may differ between
	  # Hugging Face CLI versions - confirm against the installed CLI.
	  local -a cli_args=(
	    "${REPO_ID}"
	    --repo-type dataset
	    --cache-dir "${CACHE_DIR}"
	    --local-dir "${HF_DIR}"
	    --include "data/*.parquet"
	    --include "README.md"
	    --include "ScienceQA.py"
	  )
	  local split_name

	  "${HF_BIN[@]}" "${cli_args[@]}"

	  for split_name in train validation test; do
	    verify_glob "${HF_DIR}/data/${split_name}-*.parquet"
	  done
	}

	#######################################
	# Download and extract the image archive of one ScienceQA split.
	# The archive is unpacked into a staging directory and moved into place
	# only after a successful extraction, so an interrupted or failed unzip
	# never leaves a partial "${IMG_DIR}/<split>" that a later run would
	# mistake for a finished download.
	# Globals:   IMG_DIR (read)
	# Arguments: $1 - split name (train, val or test)
	# Outputs:   a notice on stdout when the split already exists;
	#            error messages on stderr
	# Returns:   0 on success or when the split already exists; exits with
	#            status 1 when a required tool is missing or extraction fails
	#######################################
	download_one_split() {
	  local split="$1"
	  local zip_path="${IMG_DIR}/${split}.zip"
	  local split_dir="${IMG_DIR}/${split}"
	  local url="https://scienceqa.s3.us-west-1.amazonaws.com/images/${split}.zip"
	  local staging_dir
	  local tool

	  if [[ -d "${split_dir}" ]]; then
	    echo "Image split already exists: ${split_dir}"
	    return 0
	  fi

	  for tool in wget unzip; do
	    if ! command -v "${tool}" >/dev/null 2>&1; then
	      echo "Missing required tool: ${tool}" >&2
	      exit 1
	    fi
	  done

	  # -c resumes a partially downloaded archive left by an earlier run.
	  wget -c -O "${zip_path}" "${url}"

	  # Stage inside IMG_DIR so the final mv is a rename on the same filesystem.
	  staging_dir="$(mktemp -d "${IMG_DIR}/.${split}.XXXXXX")"
	  if ! unzip -q -o "${zip_path}" -d "${staging_dir}" \
	    || [[ ! -d "${staging_dir}/${split}" ]]; then
	    rm -rf -- "${staging_dir}"
	    echo "Failed to extract image split: ${split}" >&2
	    exit 1
	  fi

	  mv -- "${staging_dir}/${split}" "${split_dir}"
	  rm -rf -- "${staging_dir}"
	  rm -f -- "${zip_path}"
	}

	#######################################
	# Download the image archives of the train, val and test splits, in that
	# order, by delegating each one to download_one_split.
	# Arguments: none
	#######################################
	download_images() {
	  local split_name

	  for split_name in train val test; do
	    download_one_split "${split_name}"
	  done
	}

	# Reject anything that is not a known mode before starting a download.
	case "${MODE}" in
	  parquet | images | all) ;;
	  *)
	    echo "Unknown mode: ${MODE}" >&2
	    print_help >&2
	    exit 1
	    ;;
	esac

	# "all" runs both steps, parquet files first; the other two modes each
	# skip the step they do not cover.
	if [[ "${MODE}" != "images" ]]; then
	  download_parquet
	fi
	if [[ "${MODE}" != "parquet" ]]; then
	  download_images
	fi

	# Summary of where the data was placed.
	echo "Download completed."
	echo "Parquet dir: ${HF_DIR}"
	echo "Image dir: ${IMG_DIR}"