"""Explore the structure and contents of the YourBench test dataset on the Hugging Face Hub.

Loads every split, logs the schema, a few sample rows, and the value domain of
small scalar columns, writing output to both a rotating log file and stdout.
"""

import json
import os
import sys

from datasets import load_dataset
from dotenv import load_dotenv
from loguru import logger

# Pull HF_TOKEN (and any other secrets) from a local .env file into the environment.
load_dotenv()

# The token is only needed for private/gated datasets, so warn instead of failing.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    logger.warning("HF_TOKEN not found in .env file. Access to private datasets may be limited.")
# Replace loguru's default stderr handler with two sinks sharing one
# timestamped format: a rotating file and stdout.
logger.remove()
logger.add(
    "logs/yourbench_dataset_exploration.log",
    level="INFO",
    rotation="10 MB",   # start a new log file once the current one reaches 10 MB
    retention="1 week", # delete rotated files older than one week
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)

logger.add(
    sys.stdout,
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
)

logger.info("Starting YouRBench dataset exploration")
try:
    # Load every split of the dataset; the token enables gated/private access.
    dataset_name = "yourbench/yourbench_test"
    logger.info(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, token=hf_token)

    logger.info(f"Dataset structure: {dataset}")

    # Walk each split: size, schema, a few sample rows, and small value domains.
    for split_name, split_dataset in dataset.items():
        logger.info(f"\n{'='*50}\nExploring split: {split_name}\n{'='*50}")
        logger.info(f"Number of examples: {len(split_dataset)}")
        logger.info(f"Features: {split_dataset.features}")

        # Show at most three sample rows, pretty-printed as JSON.
        num_samples = min(3, len(split_dataset))
        logger.info(f"\nShowing {num_samples} sample examples:")

        for i in range(num_samples):
            example = split_dataset[i]
            example_json = json.dumps(example, indent=2, ensure_ascii=False)
            logger.info(f"\nExample {i}:\n{example_json}")

        if hasattr(split_dataset, 'column_names'):
            logger.info(f"\nColumn names: {split_dataset.column_names}")

            # For scalar columns with few distinct values, log the full value domain.
            for column in split_dataset.column_names:
                try:
                    # Only scalar Value features carry .dtype; nested/Sequence
                    # features do not, so use getattr to skip them quietly
                    # instead of raising AttributeError into the warning below.
                    dtype = getattr(split_dataset.features[column], 'dtype', None)
                    if dtype in ['string', 'bool', 'int32', 'int64']:
                        unique_values = set(split_dataset[column])
                        if len(unique_values) < 20:
                            logger.info(f"Unique values in '{column}': {unique_values}")
                except Exception as e:
                    logger.warning(f"Couldn't analyze column '{column}': {e}")

except Exception as e:
    # Top-level boundary: logger.exception records the full traceback,
    # not just the exception message.
    logger.exception(f"Error exploring dataset: {e}")

logger.info("Dataset exploration completed")