In [1]:
import requests
from typing import List, Dict, Any, Iterator

class DatasetSearchClient:
    """Small HTTP client for a dataset column-search API.

    The client sends GET requests to ``{base_url}/search`` and walks through
    the paginated response, yielding one result dictionary at a time.

    Attributes:
        base_url: Root URL of the API; ``/search`` is appended for queries.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self,
                 base_url: str = "https://librarian-bots-dataset-column-search-api.hf.space",
                 timeout: float = 30.0):
        """
        Store the connection settings; no request is made here.

        Args:
            base_url (str, optional): Root URL of the search API.
            timeout (float, optional): Per-request timeout in seconds. Defaults to 30.
        """
        self.base_url = base_url
        self.timeout = timeout

    def search(self,
               columns: List[str],
               match_all: bool = False,
               page_size: int = 100) -> Iterator[Dict[str, Any]]:
        """
        Search datasets using the provided API, automatically handling pagination.

        This is a generator: argument validation and HTTP requests only happen
        once iteration starts, and each page is fetched lazily as the previous
        one is exhausted.

        Args:
            columns (List[str]): List of column names to search for.
            match_all (bool, optional): If True, match all columns. If False, match any column. Defaults to False.
            page_size (int, optional): Number of results per page. Must be >= 1. Defaults to 100.

        Yields:
            Dict[str, Any]: Each dataset result from all pages.

        Raises:
            requests.RequestException: If there's an error with the HTTP request.
            ValueError: If ``page_size`` is not positive, or if the API returns
                an unexpected response format.
        """
        # A non-positive page size would make the "(page - 1) * page_size"
        # progress check meaningless, so reject it before any request is sent.
        if page_size < 1:
            raise ValueError("page_size must be a positive integer")

        required_keys = {"total", "page", "page_size", "results"}
        # Tolerate a base_url given with a trailing slash.
        search_url = f"{self.base_url.rstrip('/')}/search"

        page = 1
        total_results = None

        while total_results is None or (page - 1) * page_size < total_results:
            params = {
                "columns": columns,
                # The flag is sent as the lowercase text "true"/"false".
                "match_all": str(match_all).lower(),
                "page": page,
                "page_size": page_size
            }

            # Only the request/parse/validate step is guarded, so exceptions
            # raised by the consumer while we are suspended at `yield` are not
            # mistaken for API errors.
            try:
                response = requests.get(search_url, params=params, timeout=self.timeout)
                response.raise_for_status()
                data = response.json()

                if not isinstance(data, dict) or not required_keys.issubset(data):
                    raise ValueError("Unexpected response format from the API")
            except requests.RequestException as e:
                # Keep the original request/response attached and chain the cause.
                raise requests.RequestException(
                    f"Error connecting to the API: {str(e)}",
                    request=e.request,
                    response=e.response,
                ) from e
            except ValueError as e:
                raise ValueError(f"Error processing API response: {str(e)}") from e

            if total_results is None:
                # The total reported by the first page bounds the whole iteration.
                total_results = data['total']

            page_results = data['results']
            if not page_results:
                # An empty page means there is nothing more to read, whatever
                # the reported total says; stop instead of requesting further pages.
                break

            yield from page_results

            page += 1

# Instantiate the client with its default base_url; no HTTP request is sent
# until the generator returned by search() is iterated.
client = DatasetSearchClient()

In [5]:
# Drain the paginated generator into a list. match_all=True asks the API to
# match all of the listed columns rather than any of them.
results = list(client.search(['tools'],match_all=True))
# Display how many results came back.
len(results)

38

In [6]:
# Inspect the first record to see which fields each result carries.
results[0]

{'hub_id': 'llamafactory/glaive_toolcall_en',
 'likes': 1,
 'downloads': 1151,
 'tags': ['task_categories:text-generation',
  'task_categories:question-answering',
  'language:en',
  'license:apache-2.0',
  'size_categories:1K<n<10K',
  'json',
  'text',
  'datasets',
  'mlcroissant',
  'region:us',
  'llama-factory',
  'croissant'],
 'created_at': 1715955540,
 'last_modified': 1717785919,
 'license': ['apache-2.0'],
 'language': ['en'],
 'config_name': 'default',
 'column_names': ['conversations', 'tools'],
 'features': [{'name': 'conversations',
   'list': [{'name': 'from', 'dtype': 'string'},
    {'name': 'value', 'dtype': 'string'}]},
  {'name': 'tools', 'dtype': 'string'}],
 'match_count': 1}

In [9]:
from huggingface_hub import create_collection, add_collection_item

In [11]:
# Create the target collection under the librarian-bots namespace.
# exists_ok=True keeps this cell re-runnable: if the collection was already
# created by an earlier run, the existing one is returned instead of an error.
collection = create_collection(
    "Probably function calling datasets",
    namespace="librarian-bots",
    exists_ok=True,
)

In [12]:
# Show the collection's slug; it is the identifier passed to add_collection_item.
collection.slug

'librarian-bots/probably-function-calling-datasets-6683d24da13a7bb7efee7464'

In [13]:
# Each record carries a `config_name`, so the same hub_id can presumably appear
# more than once in `results` (verify against the API). De-duplicate while
# keeping the original order so every dataset is submitted only once.
unique_hub_ids = list(dict.fromkeys(item['hub_id'] for item in results))

for hub_id in unique_hub_ids:
    # exists_ok=True makes the cell safe to re-run: datasets already present in
    # the collection are skipped instead of raising an HTTP conflict error.
    add_collection_item(collection.slug, hub_id, item_type="dataset", exists_ok=True)