| | import os |
| |
|
| | import requests |
| | from huggingface_hub import hf_hub_download, snapshot_download |
| |
|
| | from transformers.testing_utils import _run_pipeline_tests, _run_staging |
| | from transformers.utils.import_utils import is_mistral_common_available |
| |
|
| |
|
# Remote files fetched into the current working directory by the `__main__` section below.
# Each file is stored under the last path segment of its URL, so every entry must have a
# unique basename; exact duplicate entries are pointless and have been removed.
URLS_FOR_TESTING_DATA = [
    "http://images.cocodataset.org/val2017/000000000139.jpg",
    "http://images.cocodataset.org/val2017/000000000285.jpg",
    "http://images.cocodataset.org/val2017/000000000632.jpg",
    "http://images.cocodataset.org/val2017/000000000724.jpg",
    "http://images.cocodataset.org/val2017/000000000776.jpg",
    "http://images.cocodataset.org/val2017/000000000785.jpg",
    "http://images.cocodataset.org/val2017/000000000802.jpg",
    "http://images.cocodataset.org/val2017/000000000872.jpg",
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
    "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
    "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
    "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png",
    "https://huggingface.co/datasets/hf-internal-testing/fixtures_videos/resolve/main/tennis.mp4",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/australia.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/f2641_0_throatclearing.wav",
    "https://huggingface.co/datasets/raushan-testing-hf/audio-test/resolve/main/glass-breaking-151256.mp3",
    "https://huggingface.co/datasets/raushan-testing-hf/images_test/resolve/main/picsum_237_200x300.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/Big_Buck_Bunny_720_10s_10MB.mp4",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/sample_demo_1.mp4",
    "https://huggingface.co/microsoft/kosmos-2.5/resolve/main/receipt_00008.png",
    "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/two_dogs.jpg",
    "https://llava-vl.github.io/static/images/view.jpg",
    "https://huggingface.co/datasets/raushan-testing-hf/videos-test/resolve/main/tiny_video.mp4",
    "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
]
| |
|
| |
|
def url_to_local_path(url, return_url_if_not_found=True):
    """Translate a URL into the name of its locally stored copy.

    The local name is the text after the last ``/`` in ``url`` and is checked
    relative to the current working directory.

    Args:
        url: The remote address of the file.
        return_url_if_not_found: When true and no file with the local name
            exists, return ``url`` unchanged instead of the local name.

    Returns:
        The local file name, or ``url`` itself when the file is missing and
        ``return_url_if_not_found`` is true.
    """
    local_name = url.rsplit("/", 1)[-1]
    if return_url_if_not_found and not os.path.exists(local_name):
        return url
    return local_name
| |
|
| |
|
if __name__ == "__main__":
    # Script entry point: warm the local Hugging Face caches with the datasets and files
    # listed below, then download every entry of `URLS_FOR_TESTING_DATA` into the current
    # working directory. All steps are best-effort fetches with no computed result.
    if _run_pipeline_tests:
        # Imported lazily so that `datasets` is only required when this branch runs.
        import datasets

        _ = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        _ = datasets.load_dataset("hf-internal-testing/fixtures_image_utils", split="test", revision="refs/pr/1")
        _ = hf_hub_download(repo_id="nateraw/video-demo", filename="archery.mp4", repo_type="dataset")

        hf_hub_download("Narsil/asr_dummy", filename="hindi.ogg", repo_type="dataset")
        hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt")
        hf_hub_download(
            repo_id="hf-internal-testing/fixtures_docvqa",
            filename="nougat_pdf.png",
            repo_type="dataset",
            revision="ec57bf8c8b1653a209c13f6e9ee66b12df0fc2db",
        )
        hf_hub_download(
            repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        )
        hf_hub_download(
            repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        )
        hf_hub_download(
            repo_id="hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy", repo_type="dataset"
        )
        hf_hub_download(
            repo_id="hf-internal-testing/spaghetti-video",
            filename="eating_spaghetti_32_frames.npy",
            repo_type="dataset",
        )
        hf_hub_download(
            repo_id="hf-internal-testing/spaghetti-video",
            filename="eating_spaghetti_8_frames.npy",
            repo_type="dataset",
        )
        hf_hub_download(
            repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
        )
        hf_hub_download(repo_id="huggyllama/llama-7b", filename="tokenizer.model")
        hf_hub_download(
            repo_id="nielsr/audio-spectogram-transformer-checkpoint", filename="sample_audio.flac", repo_type="dataset"
        )
        hf_hub_download(repo_id="nielsr/example-pdf", repo_type="dataset", filename="example_pdf.png")
        hf_hub_download(
            repo_id="nielsr/test-image",
            filename="llava_1_6_input_ids.pt",
            repo_type="dataset",
        )
        hf_hub_download(
            repo_id="nielsr/test-image",
            filename="llava_1_6_pixel_values.pt",
            repo_type="dataset",
        )
        hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        hf_hub_download(
            repo_id="raushan-testing-hf/images_test",
            filename="emu3_image.npy",
            repo_type="dataset",
        )
        hf_hub_download(repo_id="raushan-testing-hf/images_test", filename="llava_v1_5_radar.jpg", repo_type="dataset")
        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo.npy", repo_type="dataset")
        hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="video_demo_2.npy", repo_type="dataset")
        hf_hub_download(
            repo_id="shumingh/perception_lm_test_images",
            filename="14496_0.PNG",
            repo_type="dataset",
        )
        hf_hub_download(
            repo_id="shumingh/perception_lm_test_videos",
            filename="GUWR5TyiY-M_000012_000022.mp4",
            repo_type="dataset",
        )

        # All segmentation toy files live in one dataset repo; fetch them in a single loop
        # (same files, same order as before).
        repo_id = "nielsr/image-segmentation-toy-data"
        for toy_file in (
            "instance_segmentation_image_1.png",
            "instance_segmentation_image_2.png",
            "instance_segmentation_annotation_1.png",
            "instance_segmentation_annotation_2.png",
            "semantic_segmentation_annotation_1.png",
            "semantic_segmentation_annotation_2.png",
            "semantic_segmentation_image_1.png",
            "semantic_segmentation_image_2.png",
        ):
            hf_hub_download(repo_id=repo_id, filename=toy_file, repo_type="dataset")

        hf_hub_download("shi-labs/oneformer_demo", "ade20k_panoptic.json", repo_type="dataset")

    # Snapshot the custom-architecture repo into a local directory, unless the staging flag
    # is set or the directory is already present.
    if not _run_staging:
        if not os.path.isdir("tiny-random-custom-architecture"):
            snapshot_download(
                "hf-internal-testing/tiny-random-custom-architecture",
                local_dir="tiny-random-custom-architecture",
            )

    # Only when `mistral_common` is installed: load the Mistral tokenizers through each
    # loading path used below so that their files are fetched ahead of time.
    if is_mistral_common_available():
        from mistral_common.tokens.tokenizers.mistral import MistralTokenizer

        from transformers import AutoTokenizer
        from transformers.tokenization_mistral_common import MistralCommonBackend

        repo_id = "hf-internal-testing/namespace-mistralai-repo_name-Mistral-Small-3.1-24B-Instruct-2503"
        AutoTokenizer.from_pretrained(repo_id, tokenizer_type="mistral")
        MistralCommonBackend.from_pretrained(repo_id)
        MistralTokenizer.from_hf_hub(repo_id)

        repo_id = "mistralai/Voxtral-Mini-3B-2507"
        AutoTokenizer.from_pretrained(repo_id)
        MistralTokenizer.from_hf_hub(repo_id)

    # (connect, read) timeouts in seconds, so an unresponsive server cannot block forever.
    download_timeout = (10, 60)

    # Download the plain-URL files into the current working directory.
    for url in URLS_FOR_TESTING_DATA:
        filename = url_to_local_path(url, return_url_if_not_found=False)

        # A file with this name is already here: nothing to do.
        if os.path.exists(filename):
            print(f"File already exists: {filename}")
            continue

        print(f"Downloading {filename}...")
        # Stream into a temporary sibling file and rename it only once the download is
        # complete; otherwise an interrupted transfer would leave a truncated file that the
        # existence check above would accept on the next run.
        partial_path = f"{filename}.part"
        try:
            # The context manager releases the streamed connection in every case.
            with requests.get(url, stream=True, timeout=download_timeout) as response:
                response.raise_for_status()

                with open(partial_path, "wb") as f:
                    f.writelines(response.iter_content(chunk_size=8192))
            os.replace(partial_path, filename)
            print(f"Successfully downloaded: {filename}")
        except requests.exceptions.RequestException as e:
            # Best-effort: report the failure and carry on with the remaining URLs.
            print(f"Error downloading {filename}: {e}")
            if os.path.exists(partial_path):
                os.remove(partial_path)
| |
|