transformers / utils /fetch_hub_objects_for_ci.py

Upload folder using huggingface_hub

a9bd396 verified about 1 month ago

10.4 kB

	import os

	import requests
	from huggingface_hub import hf_hub_download, snapshot_download

	from transformers.testing_utils import _run_pipeline_tests, _run_staging
	from transformers.utils.import_utils import is_mistral_common_available


	# Remote assets (images, audio, videos) that the `__main__` section of this script downloads into the current
	# working directory. Each file is saved under the last path component of its URL (see `url_to_local_path`), so
	# every entry must have a distinct basename and should be listed only once.
	URLS_FOR_TESTING_DATA = [
	    "http://images.cocodataset.org/val2017/000000000139.jpg",
	    "http://images.cocodataset.org/val2017/000000000285.jpg",
	    "http://images.cocodataset.org/val2017/000000000632.jpg",
	    "http://images.cocodataset.org/val2017/000000000724.jpg",
	    "http://images.cocodataset.org/val2017/000000000776.jpg",
	    "http://images.cocodataset.org/val2017/000000000785.jpg",
	    "http://images.cocodataset.org/val2017/000000000802.jpg",
	    "http://images.cocodataset.org/val2017/000000000872.jpg",
	    "http://images.cocodataset.org/val2017/000000039769.jpg",
	    "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
	    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
	    "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
	    "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png",
	    "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
	    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
	    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
	    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav",
	    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3",
	    "https://huggingface.co/datasets/raushan-testing-hf/images_test/resolve/main/picsum_237_200x300.jpg",
	    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4",
	    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
	    "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png",
	    "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg",
	    "https://llava-vl.github.io/static/images/view.jpg",
	    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
	    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
	]


	def url_to_local_path(url, return_url_if_not_found=True):
	    """Map a URL to the file of the same basename in the current working directory.

	    The local name is whatever follows the last "/" in `url`.

	    Args:
	        url: The remote location of the asset.
	        return_url_if_not_found: When true, fall back to `url` itself if no file with that basename exists
	            relative to the current working directory.

	    Returns:
	        The basename when the file exists locally (or when the fallback is disabled), otherwise `url` unchanged.
	    """
	    local_name = url.rsplit("/", 1)[-1]

	    # Only probe the filesystem when the caller asked for the URL fallback.
	    if return_url_if_not_found and not os.path.exists(local_name):
	        return url

	    return local_name


	if __name__ == "__main__":
	    # Script entry point: pre-fetch Hub datasets/files and plain-URL assets so that later test runs can find
	    # them locally.
	    # NOTE(review): the nesting below was reconstructed from a copy of this file that had lost its indentation;
	    # the Hub prefetch calls are assumed to all live under `_run_pipeline_tests` — confirm against upstream.
	    if _run_pipeline_tests:
	        import datasets

	        _ = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
	        _ = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1")
	        _ = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset")

	        hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset")
	        hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
	        hf_hub_download(
	            repo_id="hf-internal-testing/fixtures_docvqa",
	            filename="nougat_pdf.png",
	            repo_type="dataset",
	            revision="ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db",
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/spaghetti-video",
	            filename="eating_spaghetti_32_frames.npy",
	            repo_type="dataset",
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/spaghetti-video",
	            filename="eating_spaghetti_8_frames.npy",
	            repo_type="dataset",
	        )
	        hf_hub_download(
	            repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
	        )
	        hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model")
	        hf_hub_download(
	            repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset"
	        )
	        hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
	        hf_hub_download(
	            repo_id="nielsr/test-image",
	            filename="llava_1_6_input_ids.pt",
	            repo_type="dataset",
	        )
	        hf_hub_download(
	            repo_id="nielsr/test-image",
	            filename="llava_1_6_pixel_values.pt",
	            repo_type="dataset",
	        )
	        hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
	        hf_hub_download(
	            repo_id="raushan-testing-hf/images_test",
	            filename="emu3_image.npy",
	            repo_type="dataset",
	        )
	        hf_hub_download(repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset")
	        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
	        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
	        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
	        hf_hub_download(
	            repo_id="shumingh/perception_lm_test_images",
	            filename="14496_0.PNG",
	            repo_type="dataset",
	        )
	        hf_hub_download(
	            repo_id="shumingh/perception_lm_test_videos",
	            filename="GUWR5TyiY-M_000012_000022.mp4",
	            repo_type="dataset",
	        )

	        # All toy segmentation fixtures come from one dataset repo; fetch them in the original order.
	        repo_id = "nielsr/image-segmentation-toy-data"
	        for toy_filename in (
	            "instance_segmentation_image_1.png",
	            "instance_segmentation_image_2.png",
	            "instance_segmentation_annotation_1.png",
	            "instance_segmentation_annotation_2.png",
	            "semantic_segmentation_annotation_1.png",
	            "semantic_segmentation_annotation_2.png",
	            "semantic_segmentation_image_1.png",
	            "semantic_segmentation_image_2.png",
	        ):
	            hf_hub_download(repo_id=repo_id, filename=toy_filename, repo_type="dataset")
	        hf_hub_download("shi-labs/oneformer_demo", "ade20k_panoptic.json", repo_type="dataset")

	        # NOTE(review): same file as requested earlier in this section; kept as in the original script.
	        hf_hub_download(
	            repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset"
	        )

	    # Need to specify the username on the endpoint `hub-ci`, otherwise we get
	    # `fatal: could not read Username for 'https://hub-ci.huggingface.co': Success`
	    # But this repo. is never used in a test decorated by `is_staging_test`.
	    if not _run_staging:
	        if not os.path.isdir("tiny-random-custom-architecture"):
	            snapshot_download(
	                "hf-internal-testing/tiny-random-custom-architecture",
	                local_dir="tiny-random-custom-architecture",
	            )

	    # For `tests/test_tokenization_mistral_common.py:TestMistralCommonBackend`, which eventually calls
	    # `mistral_common.tokens.tokenizers.utils.download_tokenizer_from_hf_hub` which (probably) doesn't have the cache.
	    if is_mistral_common_available():
	        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

	        from transformers import AutoTokenizer
	        from transformers.tokenization_mistral_common import MistralCommonBackend

	        repo_id = "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503"
	        AutoTokenizer.from_pretrained(repo_id, tokenizer_type="mistral")
	        MistralCommonBackend.from_pretrained(repo_id)
	        MistralTokenizer.from_hf_hub(repo_id)

	        repo_id = "mistralai/Voxtral-Mini-3B-2507"
	        AutoTokenizer.from_pretrained(repo_id)
	        MistralTokenizer.from_hf_hub(repo_id)

	    # Download files from URLs to local directory
	    # (connect timeout, read timeout) in seconds, so that an unresponsive server cannot hang the job forever.
	    request_timeout = (10, 60)
	    for url in URLS_FOR_TESTING_DATA:
	        filename = url_to_local_path(url, return_url_if_not_found=False)

	        # Skip if file already exists
	        if os.path.exists(filename):
	            print(f"File already exists: {filename}")
	            continue

	        print(f"Downloading {filename}...")
	        # Stream into a scratch file first: an interrupted transfer must never leave a truncated file under the
	        # final name, because the existence check above would then skip it on every later run.
	        partial_path = f"{filename}.part"
	        try:
	            # The context manager releases the connection even if the transfer fails midway.
	            with requests.get(url, stream=True, timeout=request_timeout) as response:
	                response.raise_for_status()

	                with open(partial_path, "wb") as f:
	                    f.writelines(response.iter_content(chunk_size=8192))
	            os.replace(partial_path, filename)
	        except requests.exceptions.RequestException as e:
	            # Best effort: report the failure and move on to the next URL.
	            print(f"Error downloading {filename}: {e}")
	        else:
	            print(f"Successfully downloaded: {filename}")
	        finally:
	            # After a successful `os.replace` the scratch file is gone; otherwise discard the partial data.
	            if os.path.exists(partial_path):
	                os.remove(partial_path)