Spaces:
Sleeping
Sleeping
| import random | |
| from datasets import load_dataset | |
| from typing import List, Tuple | |
| class DatasetLoader: | |
| """Handles dataset loading and sampling.""" | |
| def get_sample_prompts(dataset_name: str, num_samples: int, seed: int = 42) -> Tuple[List[str], List[int]]: | |
| """Get sample prompts from dataset.""" | |
| print(f"Loading dataset: {dataset_name}") | |
| dataset = load_dataset(dataset_name) | |
| split_name = 'train' if 'train' in dataset else list(dataset.keys())[0] | |
| random.seed(seed) | |
| indices = random.sample(range(len(dataset[split_name])), num_samples) | |
| # Handle different dataset formats | |
| samples = [] | |
| for idx in indices: | |
| item = dataset[split_name][idx] | |
| if 'instruction' in item: | |
| samples.append(item['instruction']) | |
| elif 'text' in item: | |
| samples.append(item['text']) | |
| elif 'prompt' in item: | |
| samples.append(item['prompt']) | |
| else: | |
| # Fallback - use first text field | |
| text_field = next(k for k, v in item.items() if isinstance(v, str)) | |
| samples.append(item[text_field]) | |
| return samples, indices |