| |
import csv
import logging
from itertools import islice
from pathlib import Path
from typing import List, Dict
|
|
| |
| |
| |
# Internal dataset key -> CSV file name inside the datasets directory.
_DATASET_FILES: Dict[str, str] = dict(
    bar_exam="BarExam_qa.csv",
    causal_judgment="bbh_causal_judgement.csv",
    snarks="bbh_snarks.csv",
    bbq_disamb="BBQ_disamb.csv",
    cnn_dailymail="CNN_dailymail.csv",
    drop="drop.csv",
    esnli="eSNLI.csv",
    fever="fever.csv",
    hotpot_qa="hotpot_qa.csv",
    medical_qa="medical_qa.csv",
)
|
|
| |
| |
| |
# Internal dataset key -> human-readable label.
_DATASET_DISPLAY_NAMES: Dict[str, str] = dict(
    bar_exam="Bar Exam Questions",
    causal_judgment="Causal Judgment",
    snarks="Snarks",
    bbq_disamb="BBQ Disambiguation",
    cnn_dailymail="CNN / DailyMail Summaries",
    drop="DROP Reading Comprehension",
    esnli="e-SNLI Natural Language Inference",
    fever="FEVER Fact Checking",
    hotpot_qa="HotpotQA Multi-hop Questions",
    medical_qa="Medical Questions",
)
|
|
| |
| |
| |
def _datasets_dir() -> Path:
    """Return the absolute path of the ``datasets`` folder.

    The folder is located one level above the directory that contains this
    module, i.e. ``<directory of this file>/../datasets``.
    """
    module_dir = Path(__file__).resolve().parent
    candidate = module_dir.parent / "datasets"
    return candidate.resolve()
|
|
| |
| |
| |
def _pick_first_nonempty(raw: Dict[str, str], candidates: List[str]) -> str:
    """Return the first usable value found under any of ``candidates``.

    Candidate keys are tried in order. A value is usable when it is not
    ``None`` and its string form is not blank after stripping whitespace.
    The value is returned as a string without being stripped; an empty
    string is returned when no candidate yields a usable value.
    """
    values = (raw.get(key) for key in candidates)
    usable = (str(v) for v in values if v is not None and str(v).strip())
    return next(usable, "")
|
|
| |
| |
| |
def _load_one_dataset(
    name: str, filename: str, limit: int = 10
) -> List[Dict[str, str]]:
    """Load up to ``limit`` examples from one dataset CSV.

    Each CSV row is converted to the standard example format::

        {
            "id": "example_1",
            "context": "...",
            "prompt": "...",
            "answer": "..."  # only present when a non-blank answer exists
        }

    Args:
        name: Internal dataset key; used only in log messages.
        filename: CSV file name inside the datasets directory.
        limit: Maximum number of rows to read (default 10). Negative
            values are treated as 0.

    Returns:
        The converted examples in file order, or an empty list when the
        file is missing or cannot be read or parsed.
    """
    path = _datasets_dir() / filename
    log = logging.getLogger(__name__)
    examples: List[Dict[str, str]] = []

    try:
        # "utf-8-sig" strips a leading byte-order mark if one is present;
        # with plain "utf-8" the BOM stays attached to the first header
        # name and that column is never matched by the lookups below.
        with path.open(
            "r", encoding="utf-8-sig", errors="replace", newline=""
        ) as f:
            reader = csv.DictReader(f)

            # Stop reading after `limit` rows instead of parsing the whole
            # file and discarding everything past the limit afterwards.
            for i, raw in enumerate(islice(reader, max(limit, 0)), start=1):
                # Fall back to a positional id when the row has none.
                ex_id = (
                    raw.get("id")
                    or raw.get("example_id")
                    or raw.get("uid")
                    or f"example_{i}"
                )

                context = _pick_first_nonempty(raw, [
                    "Context", "context",
                    "passage", "article", "story", "premise",
                    "paragraph", "document", "sentence1", "sent1",
                    "background",
                ])

                prompt = _pick_first_nonempty(raw, [
                    "Prompt", "prompt",
                    "question", "input", "query",
                    "sentence2", "sent2", "hypothesis",
                    "qa_question", "title",
                ])

                answer = _pick_first_nonempty(raw, [
                    "Answer", "answer",
                    "target", "gold", "label", "output", "reference",
                    "highlights",
                ])

                ex = {
                    "id": str(ex_id),
                    "context": context,
                    "prompt": prompt,
                }
                # The answer key is optional and omitted when empty.
                if answer:
                    ex["answer"] = answer

                examples.append(ex)
    except FileNotFoundError:
        log.warning("Dataset %r: file not found: %s", name, path)
        return []
    except Exception:
        # Loading stays best-effort (an unreadable dataset yields no
        # examples), but the failure is logged rather than hidden.
        log.exception("Dataset %r: failed to load %s", name, path)
        return []

    return examples
|
|
| |
| |
| |
def _load_all_datasets() -> Dict[str, List[Dict[str, str]]]:
    """Load every dataset listed in ``_DATASET_FILES``, keyed by dataset name."""
    loaded: Dict[str, List[Dict[str, str]]] = {}
    for key, csv_name in _DATASET_FILES.items():
        loaded[key] = _load_one_dataset(key, csv_name)
    return loaded


# Populated once at import time; the public accessors read from this mapping.
_DATA: Dict[str, List[Dict[str, str]]] = _load_all_datasets()
|
|
| |
| |
| |
def list_datasets() -> List[str]:
    """Return every loaded dataset key in ascending alphabetical order."""
    names = list(_DATA)
    names.sort()
    return names
|
|
|
|
def get_dataset_display_name(dataset: str) -> str:
    """Look up the human-readable name for ``dataset``.

    Keys without a registered display name are returned unchanged.
    """
    if dataset in _DATASET_DISPLAY_NAMES:
        return _DATASET_DISPLAY_NAMES[dataset]
    return dataset
|
|
|
|
def get_dataset_key_from_display_name(display_name: str) -> str:
    """Map a display name back to its internal dataset key.

    The first key whose display name equals ``display_name`` wins. When no
    entry matches, ``display_name`` itself is returned unchanged.
    """
    matching_keys = (
        key
        for key, label in _DATASET_DISPLAY_NAMES.items()
        if label == display_name
    )
    return next(matching_keys, display_name)
|
|
|
|
def list_datasets_with_display_names() -> List[tuple[str, str]]:
    """Return ``(key, display_name)`` pairs ordered by display name.

    Datasets without a registered display name use their key as the label.
    """
    labelled = []
    for key in _DATA:
        labelled.append((key, _DATASET_DISPLAY_NAMES.get(key, key)))
    labelled.sort(key=lambda pair: pair[1])
    return labelled
|
|
|
|
def list_dataset_display_names() -> List[str]:
    """Return the display name of every loaded dataset, sorted alphabetically.

    Datasets without a registered display name contribute their key instead.
    """
    return sorted(_DATASET_DISPLAY_NAMES.get(key, key) for key in _DATA)
|
|
|
|
def get_examples(dataset: str, n: int = 10) -> List[Dict[str, str]]:
    """Return up to ``n`` examples for a dataset.

    Args:
        dataset: Internal dataset key.
        n: Maximum number of examples to return. Values of zero or less
            yield an empty list.

    Returns:
        A new list holding at most ``n`` of the examples cached for
        ``dataset``, in their loaded order.

    Raises:
        KeyError: If ``dataset`` is not a known dataset key.
    """
    if dataset not in _DATA:
        raise KeyError(f"Unknown dataset: {dataset}")
    # Clamp at zero: a negative bound would make the slice mean "all but
    # the last |n| items", returning examples when none were asked for.
    return _DATA[dataset][: max(n, 0)]
|
|
|
|
def get_example_by_id(dataset: str, ex_id: str) -> Dict[str, str]:
    """Fetch the example in ``dataset`` whose ``"id"`` equals ``ex_id``.

    Raises:
        KeyError: If ``dataset`` is unknown, or if none of its examples has
            the requested id.
    """
    if dataset not in _DATA:
        raise KeyError(f"Unknown dataset: {dataset}")
    found = next(
        (item for item in _DATA[dataset] if item["id"] == ex_id),
        None,
    )
    if found is None:
        raise KeyError(f"Example id '{ex_id}' not found in dataset '{dataset}'")
    return found
|
|