Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Script to fetch the 10 most used evaluation datasets from Hugging Face. | |
| """ | |
| import requests | |
| from typing import List, Dict | |
# Substrings that mark a dataset as evaluation-related when they occur in a
# tag or in the dataset id.
_EVAL_KEYWORDS = (
    "evaluation", "benchmark", "eval", "test-set", "validation",
    "leaderboard", "assessment", "metric",
)

# Names of well-known evaluation datasets, matched as substrings of the id.
_KNOWN_EVAL_DATASETS = (
    "glue", "superglue", "squad", "xnli", "hellaswag", "winogrande",
    "arc", "mmlu", "gsm8k", "humaneval", "mbpp", "truthfulqa",
    "bigbench", "c4", "piqa", "siqa", "boolq", "copa", "multirc",
    "record", "rte", "wic", "wsc", "cb", "axb", "axg", "swag",
    "race", "qnli", "wnli", "sst", "cola", "stsb", "mrpc", "qqp",
)

_HF_DATASETS_API = "https://huggingface.co/api/datasets"


def _looks_like_eval(dataset_id, tags):
    """Return True if the lowercased id or any tag contains an eval marker."""
    # NOTE(review): this is plain substring matching, kept unchanged from the
    # original heuristic. It also fires on unrelated words that merely contain
    # a marker (e.g. "retrieval" contains "eval", "search" contains "arc").
    if any(marker in dataset_id for marker in _EVAL_KEYWORDS + _KNOWN_EVAL_DATASETS):
        return True
    return any(
        keyword in str(tag).lower()
        for tag in tags
        for keyword in _EVAL_KEYWORDS
    )


def get_popular_eval_datasets(limit: int = 10, *, timeout: float = 30.0,
                              session=None) -> List[Dict]:
    """
    Fetch popular evaluation datasets from Hugging Face Hub API.

    The 100 most-downloaded datasets are requested in a single call and then
    filtered locally with a keyword heuristic, so fewer than ``limit`` entries
    may be returned.

    Args:
        limit: Number of datasets to return. Zero or a negative value yields
            an empty list without contacting the API.
        timeout: Seconds to wait for the HTTP request before giving up.
        session: Optional object exposing a requests-style
            ``get(url, params=..., timeout=...)`` method (for example a
            ``requests.Session``). Defaults to the ``requests`` module.

    Returns:
        List of dataset information dictionaries with the keys ``name``,
        ``downloads``, ``likes``, ``tags`` (at most 5 strings) and
        ``description`` (at most 200 characters), sorted by downloads in
        descending order.

    Raises:
        requests.exceptions.RequestException: when the default ``requests``
            transport fails, times out, or receives an HTTP error status.
    """
    if limit <= 0:
        # A negative slice bound would silently drop entries from the end.
        return []

    params = {
        "sort": "downloads",   # Sort by most downloaded
        "direction": "-1",     # Descending order
        "limit": 100,          # Get more than `limit` so there is room to filter
        "full": "true",
    }
    http = requests if session is None else session
    # Without a timeout a stalled connection would block the caller forever.
    response = http.get(_HF_DATASETS_API, params=params, timeout=timeout)
    response.raise_for_status()

    eval_datasets = []
    for dataset in response.json():
        # `or` (rather than a .get default) also covers explicit JSON nulls.
        tags = dataset.get("tags") or []
        name = dataset.get("id") or ""
        if not _looks_like_eval(name.lower(), tags):
            continue
        eval_datasets.append({
            "name": name,
            "downloads": dataset.get("downloads") or 0,
            "likes": dataset.get("likes") or 0,
            "tags": [tag for tag in tags if isinstance(tag, str)][:5],  # First 5 tags
            "description": (dataset.get("description") or "")[:200],  # First 200 chars
        })

    # Sort by downloads and return top N
    eval_datasets.sort(key=lambda entry: entry["downloads"], reverse=True)
    return eval_datasets[:limit]
def main():
    """Fetch the top evaluation datasets and print a ranked summary of each."""
    print("Fetching the 10 most used evaluation datasets from Hugging Face...\n")

    try:
        ranked = get_popular_eval_datasets(10)

        for rank, entry in enumerate(ranked, start=1):
            print(f"{rank}. {entry['name']}")
            print(f" Downloads: {entry['downloads']:,}")
            print(f" Likes: {entry['likes']}")

            # Optional fields are only shown when they are non-empty.
            labels = entry['tags']
            if labels:
                print(f" Tags: {', '.join(labels)}")

            summary = entry['description']
            if summary:
                print(f" Description: {summary}...")

            # Blank line between entries.
            print()
    except requests.exceptions.RequestException as err:
        # Network / HTTP failures get their own message.
        print(f"Error fetching data from Hugging Face: {err}")
    except Exception as err:
        # Top-level boundary: report anything else instead of a traceback.
        print(f"An error occurred: {err}")
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()