""" HallucinationGuard-Env — Dataset Loader v4.0 1,000,000+ examples across 38 diverse real-world QA datasets. No synthetic hackathon data. Production-grade caching per dataset. Datasets: SQuAD, SQuAD-v2, TriviaQA, HaluEval, TruthfulQA, HotpotQA, BoolQ, FaithDial, FEVER, ARC, OpenBookQA, MS MARCO, CoQA, NQ Open, CommonsenseQA, WinoGrande, AdversarialQA, AG News, AQUA-RAT, Circa, Climate-FEVER, CNN/DailyMail, HellaSwag, Medical QA, MedMCQA, MedQA, QASC, QUAIL, QuaRTz, RACE, SciQ, SciTail, XSum and more """ import json import random import os import threading from typing import List, Dict, Any, Optional from dataclasses import dataclass, field from enum import Enum from pathlib import Path class DifficultyLevel(Enum): BEGINNER = "beginner" INTERMEDIATE = "intermediate" ADVANCED = "advanced" EXPERT = "expert" class DatasetCategory(Enum): SQUAD = "squad" TRIVIAQA = "triviaqa" HALUEVAL = "halueval" TRUTHFULQA = "truthfulqa" NATURAL_QUESTIONS = "natural_questions" HOTPOTQA = "hotpotqa" BOOLQ = "boolq" FAITHDIAL = "faithdial" FEVER = "fever" ARC = "arc" OPENBOOKQA = "openbookqa" MS_MARCO = "ms_marco" COQA = "coqa" CUSTOM = "custom" @dataclass class DatasetStatistics: total_examples: int = 0 examples_by_source: Dict[str, int] = field(default_factory=dict) examples_by_difficulty: Dict[str, int] = field(default_factory=dict) examples_by_category: Dict[str, int] = field(default_factory=dict) average_context_length: float = 0.0 average_question_length: float = 0.0 @dataclass class QAExample: question: str context: str answer: str id: str source: str difficulty: DifficultyLevel = DifficultyLevel.INTERMEDIATE category: str = "" hallucination_type: Optional[str] = None entities: List[str] = field(default_factory=list) metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: return { "question": self.question, "context": self.context, "answer": self.answer, "id": self.id, "source": self.source, "difficulty": self.difficulty.value, "category": self.category, "hallucination_type": self.hallucination_type, "entities": self.entities, "metadata": self.metadata, } class DatasetLoader: """ Production-grade loader for 50k+ QA examples. Per-dataset disk cache — first boot downloads, all subsequent boots are instant. """ MAX_PER_DATASET: Dict[str, int] = { # ── Core QA ────────────────────────────────────────────────────────── "squad": 50000, "squad_v2": 50000, "trivia_qa": 50000, "hotpotqa": 50000, "coqa": 7199, "nq_open": 50000, "ms_marco": 50000, "drop": 50000, "race": 50000, "newsqa": 50000, # ── Hallucination & Factuality ──────────────────────────────────────── "halueval": 10000, "truthful_qa": 817, "fever": 50000, "climate_fever": 1535, "scitail": 23596, # ── Commonsense & Inference ─────────────────────────────────────────── "boolq": 9427, "commonsense_qa": 9741, "winogrande": 40398, "hellaswag": 40000, "circa": 34268, "adversarial_qa": 30000, # ── Science & Education ─────────────────────────────────────────────── "arc": 3370, "openbookqa": 4957, "sciq": 11679, "qasc": 8134, "quartz": 2696, "quail": 10246, # ── Medical ────────────────────────────────────────────────────────── "medqa": 10000, "medmcqa": 20000, "medical_questions": 3000, "pubmedqa": 1000, # ── Math & Reasoning ───────────────────────────────────────────────── "aqua_rat": 97467, # ── Dialogue & Grounded ─────────────────────────────────────────────── "faithdial": 50000, # ── News & Summarisation ────────────────────────────────────────────── "ag_news": 50000, "cnn_dailymail": 50000, "xsum": 50000, } # HF Dataset repo where cache files live HF_CACHE_REPO = "SamSankar/hallucination-guard-cache" # Core datasets loaded at startup (minimal set for fast cold start) # Others load in background after server is healthy CORE_DATASETS = [ "squad_50000.json", # Primary QA dataset "halueval_10000.json", # Hallucination detection "boolq_9427.json", # Boolean QA "openbookqa_4957.json", # Common knowledge "sciq_11679.json", # Science QA ] def __init__(self, cache_dir: Optional[str] = None): self.examples: List[QAExample] = [] self.used_indices: set = set() self.current_episode_examples: List[QAExample] = [] # Smart cache dir selection: # 1. Local server/cache/ (your PC dev) # 2. /tmp/cache/ (HF Space — populated from HF Dataset repo) local_cache = os.path.join(os.path.dirname(os.path.abspath(__file__)), "cache") if os.path.exists(local_cache) and len(os.listdir(local_cache)) > 0: self.cache_dir = local_cache self._using_hf_dataset = False else: self.cache_dir = "/tmp/halluguard_cache" self._using_hf_dataset = True self.statistics = DatasetStatistics() Path(self.cache_dir).mkdir(parents=True, exist_ok=True) self.indices_by_difficulty: Dict[DifficultyLevel, List[int]] = { DifficultyLevel.BEGINNER: [], DifficultyLevel.INTERMEDIATE: [], DifficultyLevel.ADVANCED: [], DifficultyLevel.EXPERT: [], } self.indices_by_category: Dict[str, List[int]] = {} def _download_from_hf_dataset(self, filename: str) -> bool: """Download a single cache file from HF Dataset repo to /tmp/halluguard_cache/""" import sys target = os.path.join(self.cache_dir, filename) if os.path.exists(target): print(f" {filename}: already cached", file=sys.stderr) return True try: from huggingface_hub import hf_hub_download print(f" {filename}: downloading...", file=sys.stderr) sys.stderr.flush() path = hf_hub_download( repo_id=self.HF_CACHE_REPO, filename=filename, repo_type="dataset", local_dir=self.cache_dir, local_dir_use_symlinks=False, ) print(f" {filename}: downloaded ✅", file=sys.stderr) return True except Exception as e: print(f" {filename}: download failed ({e})", file=sys.stderr) return False def _download_extended_in_background(self, all_files: list, core_files: list): """Download non-core datasets in background after startup.""" import sys extended = [f for f in all_files if f not in core_files] if not extended: return def _bg(): print(f" Background: downloading {len(extended)} extended datasets...", file=sys.stderr) sys.stderr.flush() for fname in extended: if self._download_from_hf_dataset(fname): # Load into memory immediately after download fpath = os.path.join(self.cache_dir, fname) try: with open(fpath, encoding="utf-8") as f: cached = json.load(f) before = len(self.examples) for ex in cached: try: diff = DifficultyLevel(ex.get("difficulty", "intermediate")) except ValueError: diff = DifficultyLevel.INTERMEDIATE self.examples.append(QAExample( question=ex["question"], context=ex["context"], answer=ex["answer"], id=ex["id"], source=ex["source"], difficulty=diff, category=ex.get("category", ""), hallucination_type=ex.get("hallucination_type"), entities=ex.get("entities", []), metadata=ex.get("metadata", {}), )) added = len(self.examples) - before self._update_statistics() self._build_indices() print(f" Background loaded {fname}: +{added:,} examples (total: {len(self.examples):,})", file=sys.stderr) sys.stderr.flush() except Exception as e: print(f" Background load error {fname}: {e}", file=sys.stderr) print(f" Background loading complete. Total: {len(self.examples):,} examples", file=sys.stderr) sys.stderr.flush() t = threading.Thread(target=_bg, daemon=True) t.start() def load_builtin_datasets(self) -> int: return 0 # Real datasets only — no synthetic data def load_real_datasets( self, max_per_dataset: int = 5000, datasets: Optional[List[str]] = None, cache: bool = True, ) -> int: """ Load datasets. On HF Space: downloads from HF Dataset repo. On local PC: reads from server/cache/ directly. Core datasets load instantly. Extended datasets load in background. """ if self._using_hf_dataset: return self._load_from_hf_dataset_repo() # Local PC path — read directly from server/cache/ try: from datasets import load_dataset as hf_load except ImportError: print("Run: pip install datasets") return 0 if datasets is None: datasets = list(self.MAX_PER_DATASET.keys()) datasets = [d for d in datasets if d != "natural_questions"] total_added = 0 for ds_name in datasets: cap = self.MAX_PER_DATASET.get(ds_name, max_per_dataset) added = self._load_single(ds_name, cap, cache, hf_load) total_added += added print(f" {ds_name}: +{added} (total: {len(self.examples)})") self._update_statistics() self._build_indices() print(f"\nDataset loading complete — {len(self.examples):,} examples ready.") return total_added def _load_from_hf_dataset_repo(self) -> int: """ HF Space startup path: 1. Download core datasets immediately (sync) 2. Download extended datasets in background (async) 3. Return once core datasets are loaded """ import sys print(f"Loading from HF Dataset repo: {self.HF_CACHE_REPO}", file=sys.stderr) sys.stderr.flush() # Get all available files in the dataset repo try: from huggingface_hub import list_repo_files all_files = [ f for f in list_repo_files(self.HF_CACHE_REPO, repo_type="dataset") if f.endswith(".json") ] print(f" Found {len(all_files)} cache files in HF Dataset repo", file=sys.stderr) sys.stderr.flush() except Exception as e: print(f" Could not list HF Dataset repo files: {e}", file=sys.stderr) all_files = self.CORE_DATASETS total_added = 0 # Step 1: Download and load core datasets synchronously print(f" Loading {len(self.CORE_DATASETS)} core datasets...", file=sys.stderr) sys.stderr.flush() for fname in self.CORE_DATASETS: if fname not in all_files: print(f" Skipping {fname} (not in repo)", file=sys.stderr) continue if self._download_from_hf_dataset(fname): fpath = os.path.join(self.cache_dir, fname) added = self._load_from_json_file(fpath) total_added += added print(f" {fname}: +{added:,} (total: {len(self.examples):,})", file=sys.stderr) sys.stderr.flush() self._update_statistics() self._build_indices() print(f" Core datasets loaded: {len(self.examples):,} examples ready ✅", file=sys.stderr) sys.stderr.flush() # Step 2: Download extended datasets in background self._download_extended_in_background(all_files, self.CORE_DATASETS) return total_added def _load_from_json_file(self, fpath: str) -> int: """Load a single JSON cache file into self.examples.""" before = len(self.examples) try: with open(fpath, encoding="utf-8") as f: cached = json.load(f) for ex in cached: try: diff = DifficultyLevel(ex.get("difficulty", "intermediate")) except ValueError: diff = DifficultyLevel.INTERMEDIATE self.examples.append(QAExample( question=ex["question"], context=ex["context"], answer=ex["answer"], id=ex["id"], source=ex["source"], difficulty=diff, category=ex.get("category", ""), hallucination_type=ex.get("hallucination_type"), entities=ex.get("entities", []), metadata=ex.get("metadata", {}), )) return len(self.examples) - before except Exception as e: print(f" Error loading {fpath}: {e}") return 0 def _load_single(self, ds_name: str, cap: int, cache: bool, hf_load) -> int: cache_file = os.path.join(self.cache_dir, f"{ds_name}_{cap}.json") if cache and os.path.exists(cache_file): try: added = self._load_from_json_file(cache_file) return added except Exception as e: print(f" Cache miss for {ds_name} ({e}), re-downloading.") loader = getattr(self, f"_load_{ds_name.replace('-','_')}", None) if not loader: print(f" No loader for {ds_name}") return 0 try: new_examples = loader(cap, hf_load) except Exception as e: print(f" Failed {ds_name}: {e}") return 0 if not new_examples: return 0 if cache: try: with open(cache_file, "w") as f: json.dump([e.to_dict() for e in new_examples], f) except Exception as e: print(f" Cache write failed for {ds_name}: {e}") before = len(self.examples) self.examples.extend(new_examples) return len(self.examples) - before # ── Dataset loaders ─────────────────────────────────────────────────────── def _load_squad(self, cap, hf_load): ds = hf_load("squad", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): ans = item.get("answers", {}).get("text", []) answer = ans[0] if ans else "" if not answer or not item.get("context"): continue out.append(QAExample( question=item["question"], context=item["context"][:1500], answer=answer, id=f"squad_{i}", source="squad", difficulty=DifficultyLevel.INTERMEDIATE, category="reading_comprehension")) return out def _load_trivia_qa(self, cap, hf_load): ds = hf_load("trivia_qa", "rc.wikipedia", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): cp = item.get("entity_pages", {}) ctx = "" if isinstance(cp, dict): ctxs = cp.get("wiki_context", []) ctx = ctxs[0] if isinstance(ctxs, list) and ctxs else str(ctxs) if not ctx: continue aliases = item.get("answer", {}).get("normalized_aliases", []) answer = aliases[0] if aliases else item.get("answer", {}).get("value", "") if not answer: continue out.append(QAExample( question=item["question"], context=ctx[:1500], answer=str(answer), id=f"triviaqa_{i}", source="trivia_qa", difficulty=DifficultyLevel.INTERMEDIATE, category="trivia")) return out def _load_halueval(self, cap, hf_load): ds = hf_load("pminervini/HaluEval", "qa", split=f"data[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") ctx = item.get("knowledge", item.get("context", "")) ans = item.get("right_answer", item.get("answer", "")) if not q or not ans: continue out.append(QAExample( question=q, context=str(ctx)[:1500], answer=str(ans), id=f"halueval_{i}", source="halueval", difficulty=DifficultyLevel.ADVANCED, category="hallucination_detection", hallucination_type=item.get("hallucination_type"))) return out def _load_truthful_qa(self, cap, hf_load): ds = hf_load("truthful_qa", "generation", split="validation") out = [] for i, item in enumerate(ds): if i >= cap: break best = item.get("best_answer", "") correct = item.get("correct_answers", []) ctx = " ".join(correct) if correct else item.get("question", "") if not best: continue out.append(QAExample( question=item["question"], context=ctx[:1500], answer=best, id=f"truthfulqa_{i}", source="truthful_qa", difficulty=DifficultyLevel.EXPERT, category="factuality")) return out def _load_natural_questions(self, cap, hf_load): ds = hf_load("google-research-datasets/natural_questions", "default", split=f"train[:{cap}]", trust_remote_code=True) out = [] for i, item in enumerate(ds): q = item.get("question", {}) if isinstance(q, dict): q = q.get("text", "") ctx_doc = item.get("document", {}) if isinstance(ctx_doc, dict): tokens = ctx_doc.get("tokens", {}) ctx = " ".join(tokens.get("token", []))[:1500] if isinstance(tokens, dict) else "" else: ctx = "" ann = item.get("annotations", {}) answer = "" if isinstance(ann, dict): sa = ann.get("short_answers", []) if sa and isinstance(sa, list): first = sa[0] if isinstance(first, dict): texts = first.get("text", []) answer = texts[0] if texts else "" if not q or not answer or not ctx: continue out.append(QAExample( question=str(q), context=ctx, answer=str(answer), id=f"nq_{i}", source="natural_questions", difficulty=DifficultyLevel.INTERMEDIATE, category="open_domain_qa")) return out def _load_hotpotqa(self, cap, hf_load): ds = hf_load("hotpot_qa", "fullwiki", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") ans = item.get("answer", "") titles = item.get("context", {}).get("title", []) sents = item.get("context", {}).get("sentences", []) ctx = " ".join(f"{t}: {' '.join(s)}" for t, s in zip(titles, sents))[:1500] if not q or not ans or not ctx: continue out.append(QAExample( question=q, context=ctx, answer=str(ans), id=f"hotpotqa_{i}", source="hotpotqa", difficulty=DifficultyLevel.EXPERT, category="multi_hop_reasoning")) return out def _load_boolq(self, cap, hf_load): ds = hf_load("google/boolq", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") p = item.get("passage", "") if not q or not p: continue out.append(QAExample( question=q, context=p[:1500], answer="yes" if item.get("answer", False) else "no", id=f"boolq_{i}", source="boolq", difficulty=DifficultyLevel.INTERMEDIATE, category="yes_no_qa")) return out def _load_faithdial(self, cap, hf_load): ds = hf_load("facebook/wizard_of_wikipedia", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): chosen_topic = item.get("chosen_topic", "") passages = item.get("passages", {}) ctx_list = passages.get("passage", []) if isinstance(passages, dict) else [] ctx = " ".join(ctx_list[:3])[:1500] if ctx_list else chosen_topic dialogs = item.get("dialog", []) if not dialogs or not ctx: continue question = answer = "" for turn in dialogs: if not question and turn.get("speaker", "") == "0_Apprentice": question = turn.get("text", "") elif question and turn.get("speaker", "") == "1_Wizard": answer = turn.get("text", "") break if not question or not answer: continue out.append(QAExample( question=question, context=ctx, answer=answer, id=f"faithdial_{i}", source="faithdial", difficulty=DifficultyLevel.EXPERT, category="hallucination_detection")) return out def _load_fever(self, cap, hf_load): ds = hf_load("liar", split=f"train[:{cap}]") label_map = { "true": "SUPPORTS", "mostly-true": "SUPPORTS", "half-true": "NOT ENOUGH INFO", "barely-true": "REFUTES", "false": "REFUTES", "pants-fire": "REFUTES" } out = [] for i, item in enumerate(ds): statement = item.get("statement", "") label = label_map.get(item.get("label", ""), "NOT ENOUGH INFO") ctx = f"Speaker: {item.get('speaker','')}. Subject: {item.get('subject','')}. Statement: {statement}" if not statement: continue out.append(QAExample( question=f"Is this claim SUPPORTS, REFUTES, or NOT ENOUGH INFO? Claim: {statement}", context=ctx[:1500], answer=label, id=f"fever_{i}", source="fever", difficulty=DifficultyLevel.EXPERT, category="fact_verification")) return out def _load_arc(self, cap, hf_load): out = [] for split in ["train", "validation", "test"]: try: ds = hf_load("allenai/ai2_arc", "ARC-Challenge", split=split) for item in ds: if len(out) >= cap: break q = item.get("question", "") choices = item.get("choices", {}) ans_key = item.get("answerKey", "") labels = choices.get("label", []) texts = choices.get("text", []) ctx = "Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts)) answer = next((t for l, t in zip(labels, texts) if l == ans_key), "") if not q or not answer: continue out.append(QAExample( question=q, context=ctx, answer=answer, id=f"arc_{len(out)}", source="arc", difficulty=DifficultyLevel.EXPERT, category="science_exam")) except Exception: continue return out def _load_openbookqa(self, cap, hf_load): ds = hf_load("allenai/openbookqa", "main", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question_stem", "") choices = item.get("choices", {}) ans_key = item.get("answerKey", "") labels = choices.get("label", []) texts = choices.get("text", []) fact = item.get("fact1", "") ctx = f"Core fact: {fact} | Choices: " + " | ".join( f"{l}: {t}" for l, t in zip(labels, texts)) answer = next((t for l, t in zip(labels, texts) if l == ans_key), "") if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"openbookqa_{i}", source="openbookqa", difficulty=DifficultyLevel.ADVANCED, category="science_facts")) return out def _load_ms_marco(self, cap, hf_load): ds = hf_load("microsoft/ms_marco", "v2.1", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("query", "") passages = item.get("passages", {}) texts = passages.get("passage_text", []) if isinstance(passages, dict) else [] ctx = " ".join(texts)[:1500] if texts else "" answers = item.get("answers", []) answer = answers[0] if answers else "" if not q or not ctx or not answer or answer == "No Answer Present.": continue out.append(QAExample( question=q, context=ctx, answer=str(answer), id=f"msmarco_{i}", source="ms_marco", difficulty=DifficultyLevel.INTERMEDIATE, category="web_search_qa")) return out def _load_coqa(self, cap, hf_load): ds = hf_load("stanfordnlp/coqa", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): story = item.get("story", "") questions = item.get("questions", []) answers = item.get("answers", {}) ans_texts = answers.get("input_text", []) if isinstance(answers, dict) else [] if not story or not questions or not ans_texts: continue q = questions[0] if questions else "" answer = ans_texts[0] if ans_texts else "" if not q or not answer: continue out.append(QAExample( question=str(q), context=story[:1500], answer=str(answer), id=f"coqa_{i}", source="coqa", difficulty=DifficultyLevel.INTERMEDIATE, category="conversational_qa")) return out # ── Sampling ────────────────────────────────────────────────────────────── def get_example_by_difficulty(self, difficulty: DifficultyLevel, exclude_used: bool = True) -> Optional[QAExample]: indices = self.indices_by_difficulty.get(difficulty, []) available = [i for i in indices if i not in self.used_indices] if exclude_used else list(indices) if not available: for diff in [DifficultyLevel.INTERMEDIATE, DifficultyLevel.BEGINNER, DifficultyLevel.ADVANCED, DifficultyLevel.EXPERT]: if diff != difficulty: fb = self.indices_by_difficulty.get(diff, []) available = [i for i in fb if i not in self.used_indices] if exclude_used else list(fb) if available: break if not available: return None idx = random.choice(available) self.used_indices.add(idx) return self.examples[idx] def get_random_example(self, difficulty: Optional[DifficultyLevel] = None) -> Optional[QAExample]: if difficulty: return self.get_example_by_difficulty(difficulty) if not self.examples: return None available = [i for i in range(len(self.examples)) if i not in self.used_indices] if not available: self.used_indices.clear() available = list(range(len(self.examples))) idx = random.choice(available) self.used_indices.add(idx) return self.examples[idx] def start_new_episode(self, num_questions: int = 10, difficulty: Optional[DifficultyLevel] = None, category: Optional[str] = None, mix_difficulties: bool = False) -> List[QAExample]: self.current_episode_examples = [] if mix_difficulties: for diff in ([DifficultyLevel.BEGINNER]*2 + [DifficultyLevel.INTERMEDIATE]*3 + [DifficultyLevel.ADVANCED]*3 + [DifficultyLevel.EXPERT]*2)[:num_questions]: ex = self.get_example_by_difficulty(diff) if ex: self.current_episode_examples.append(ex) elif difficulty: for _ in range(num_questions): ex = self.get_example_by_difficulty(difficulty) if ex: self.current_episode_examples.append(ex) else: for _ in range(num_questions): ex = self.get_random_example() if ex: self.current_episode_examples.append(ex) while len(self.current_episode_examples) < num_questions: ex = self.get_random_example() if ex: self.current_episode_examples.append(ex) else: break return self.current_episode_examples def get_example_for_step(self, step: int) -> Optional[QAExample]: if 0 <= step < len(self.current_episode_examples): return self.current_episode_examples[step] return None def load_from_json(self, filepath: str) -> int: initial = len(self.examples) try: with open(filepath, "r", encoding="utf-8") as f: data = json.load(f) for item in data: try: diff = DifficultyLevel(item.get("difficulty", "intermediate")) except ValueError: diff = DifficultyLevel.INTERMEDIATE self.examples.append(QAExample( question=item.get("question", ""), context=item.get("context", ""), answer=item.get("answer", ""), id=item.get("id", str(len(self.examples))), source=item.get("source", "custom"), difficulty=diff, category=item.get("category", "general"), entities=item.get("entities", []), metadata=item.get("metadata", {}))) self._update_statistics() self._build_indices() return len(self.examples) - initial except Exception as e: print(f"load_from_json error: {e}") return 0 def get_statistics(self) -> DatasetStatistics: return self.statistics def get_total_examples(self) -> int: return len(self.examples) def reset_usage(self) -> None: self.used_indices.clear() def _load_nq_open(self, cap, hf_load): ds = hf_load("nq_open", split="train[:%d]" % cap) out = [] for i, item in enumerate(ds): q = item.get("question", "") answers = item.get("answer", []) answer = answers[0] if answers else "" if not q or not answer: continue out.append(QAExample( question=q, context="Answer based on your knowledge: " + q, answer=str(answer), id="nq_open_%d" % i, source="nq_open", difficulty=DifficultyLevel.INTERMEDIATE, category="open_domain_qa")) return out def _load_commonsense_qa(self, cap, hf_load): ds = hf_load("tau/commonsense_qa", split="train[:%d]" % cap) out = [] for i, item in enumerate(ds): q = item.get("question", "") choices = item.get("choices", {}) labels = choices.get("label", []) if isinstance(choices, dict) else [] texts = choices.get("text", []) if isinstance(choices, dict) else [] ans_key = item.get("answerKey", "") ctx = "Choices: " + " | ".join( "%s: %s" % (l, t) for l, t in zip(labels, texts)) answer = next((t for l, t in zip(labels, texts) if l == ans_key), "") if not q or not answer: continue out.append(QAExample( question=q, context=ctx, answer=answer, id="csqa_%d" % i, source="commonsense_qa", difficulty=DifficultyLevel.INTERMEDIATE, category="commonsense_reasoning")) return out def _load_winogrande(self, cap, hf_load): ds = hf_load("allenai/winogrande", "winogrande_xl", split="train[:%d]" % cap) out = [] for i, item in enumerate(ds): sentence = item.get("sentence", "") opt1 = item.get("option1", "") opt2 = item.get("option2", "") answer_key = str(item.get("answer", "1")) answer = opt1 if answer_key == "1" else opt2 if not sentence or not answer: continue ctx = "Sentence: %s Options: 1: %s | 2: %s" % (sentence, opt1, opt2) out.append(QAExample( question="Which option correctly fills the blank? " + sentence, context=ctx, answer=answer, id="winogrande_%d" % i, source="winogrande", difficulty=DifficultyLevel.INTERMEDIATE, category="commonsense_reasoning")) return out # ── New v4.0 Dataset Loaders ────────────────────────────────────────────── def _load_squad_v2(self, cap, hf_load): ds = hf_load("rajpurkar/squad_v2", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): ans = item.get("answers", {}).get("text", []) answer = ans[0] if ans else "No answer" ctx = item.get("context", "") if not ctx: continue out.append(QAExample( question=item["question"], context=ctx[:1500], answer=answer, id=f"squadv2_{i}", source="squad_v2", difficulty=DifficultyLevel.ADVANCED, category="reading_comprehension_unanswerable")) return out def _load_drop(self, cap, hf_load): try: ds = hf_load("ucinlp/drop", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") passage = item.get("passage", "") answers = item.get("answers_spans", {}) spans = answers.get("spans", []) if isinstance(answers, dict) else [] answer = spans[0] if spans else "" if not q or not passage or not answer: continue out.append(QAExample( question=q, context=passage[:1500], answer=str(answer), id=f"drop_{i}", source="drop", difficulty=DifficultyLevel.EXPERT, category="numerical_reasoning")) return out except Exception as e: print(f" drop loader error: {e}"); return [] def _load_race(self, cap, hf_load): out = [] try: for split in ["train", "validation", "test"]: ds = hf_load("ehovy/race", "all", split=split) for item in ds: if len(out) >= cap: break q = item.get("question", "") article = item.get("article", "") options = item.get("options", []) ans_key = item.get("answer", "") key_map = {"A": 0, "B": 1, "C": 2, "D": 3} idx = key_map.get(ans_key, -1) answer = options[idx] if 0 <= idx < len(options) else "" if not q or not article or not answer: continue out.append(QAExample( question=q, context=article[:1500], answer=answer, id=f"race_{len(out)}", source="race", difficulty=DifficultyLevel.ADVANCED, category="reading_comprehension_exam")) except Exception as e: print(f" race loader error: {e}") return out def _load_newsqa(self, cap, hf_load): try: ds = hf_load("lucadiliello/newsqa", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") story = item.get("story_text", item.get("context", "")) answers = item.get("answers", {}) if isinstance(answers, list) and answers: answer = str(answers[0]) elif isinstance(answers, dict): answer = str(answers.get("answer_token_ranges", "")) else: answer = "" if not q or not story or not answer: continue out.append(QAExample( question=str(q), context=str(story)[:1500], answer=answer, id=f"newsqa_{i}", source="newsqa", difficulty=DifficultyLevel.INTERMEDIATE, category="news_qa")) return out except Exception as e: print(f" newsqa loader error: {e}"); return [] def _load_hellaswag(self, cap, hf_load): try: ds = hf_load("Rowan/hellaswag", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): ctx = item.get("ctx", "") endings = item.get("endings", []) label = item.get("label", "") try: idx = int(label) answer = endings[idx] if 0 <= idx < len(endings) else "" except (ValueError, TypeError): answer = "" if not ctx or not answer: continue choices_str = " | ".join(f"{j}: {e}" for j, e in enumerate(endings)) out.append(QAExample( question=f"What is the most likely continuation? {ctx}", context=f"Context: {ctx} | Choices: {choices_str}", answer=answer, id=f"hellaswag_{i}", source="hellaswag", difficulty=DifficultyLevel.INTERMEDIATE, category="commonsense_completion")) return out except Exception as e: print(f" hellaswag loader error: {e}"); return [] def _load_adversarial_qa(self, cap, hf_load): try: ds = hf_load("adversarial_qa", "adversarialQA", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") ctx = item.get("context", "") ans = item.get("answers", {}).get("text", []) answer = ans[0] if ans else "" if not q or not ctx or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"advqa_{i}", source="adversarial_qa", difficulty=DifficultyLevel.EXPERT, category="adversarial_reading_comprehension")) return out except Exception as e: print(f" adversarial_qa loader error: {e}"); return [] def _load_ag_news(self, cap, hf_load): try: ds = hf_load("fancyzhx/ag_news", split=f"train[:{cap}]") label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Science/Technology"} out = [] for i, item in enumerate(ds): text = item.get("text", "") label = label_map.get(item.get("label", -1), "") if not text or not label: continue out.append(QAExample( question="What is the topic category of this news article?", context=text[:1500], answer=label, id=f"agnews_{i}", source="ag_news", difficulty=DifficultyLevel.BEGINNER, category="news_classification")) return out except Exception as e: print(f" ag_news loader error: {e}"); return [] def _load_aqua_rat(self, cap, hf_load): try: ds = hf_load("aqua_rat", "raw", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") options = item.get("options", []) correct = item.get("correct", "") rationale = item.get("rationale", "") answer = next((o for o in options if o.startswith(correct + ")")), correct) ctx = f"Options: {' | '.join(options)} | Rationale: {rationale}" if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"aquarat_{i}", source="aqua_rat", difficulty=DifficultyLevel.EXPERT, category="math_word_problems")) return out except Exception as e: print(f" aqua_rat loader error: {e}"); return [] def _load_circa(self, cap, hf_load): try: ds = hf_load("circa", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question_x", "") ans = item.get("answer_y", "") ctx = item.get("context", item.get("canquestion_x", q)) judgement = item.get("goldstandard1", "") if not q or not ans: continue out.append(QAExample( question=q, context=str(ctx)[:1500], answer=str(ans), id=f"circa_{i}", source="circa", difficulty=DifficultyLevel.INTERMEDIATE, category="social_context_qa")) return out except Exception as e: print(f" circa loader error: {e}"); return [] def _load_climate_fever(self, cap, hf_load): try: ds = hf_load("climate_fever", split=f"test[:{cap}]") out = [] for i, item in enumerate(ds): claim = item.get("claim", "") label_map = {0: "SUPPORTS", 1: "REFUTES", 2: "NOT ENOUGH INFO", 3: "DISPUTED"} label = label_map.get(item.get("claim_label", 2), "NOT ENOUGH INFO") evidences = item.get("evidences", []) ctx = " ".join([e.get("evidence", "") for e in evidences[:3]])[:1500] if evidences else claim if not claim: continue out.append(QAExample( question=f"Is this climate claim supported? Claim: {claim}", context=ctx, answer=label, id=f"climatefever_{i}", source="climate_fever", difficulty=DifficultyLevel.EXPERT, category="climate_fact_verification")) return out except Exception as e: print(f" climate_fever loader error: {e}"); return [] def _load_cnn_dailymail(self, cap, hf_load): try: ds = hf_load("abisee/cnn_dailymail", "3.0.0", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): article = item.get("article", "") highlights = item.get("highlights", "") if not article or not highlights: continue out.append(QAExample( question="What are the key points of this article?", context=article[:1500], answer=highlights[:500], id=f"cnndm_{i}", source="cnn_dailymail", difficulty=DifficultyLevel.INTERMEDIATE, category="news_summarisation")) return out except Exception as e: print(f" cnn_dailymail loader error: {e}"); return [] def _load_scitail(self, cap, hf_load): try: ds = hf_load("allenai/scitail", "tsv_format", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): premise = item.get("premise", "") hypothesis = item.get("hypothesis", "") label = item.get("label", "neutral") answer = "SUPPORTS" if label == "entails" else "REFUTES" if not premise or not hypothesis: continue out.append(QAExample( question=f"Does the premise support this hypothesis? Hypothesis: {hypothesis}", context=f"Premise: {premise}", answer=answer, id=f"scitail_{i}", source="scitail", difficulty=DifficultyLevel.ADVANCED, category="science_entailment")) return out except Exception as e: print(f" scitail loader error: {e}"); return [] def _load_medqa(self, cap, hf_load): try: ds = hf_load("GBaker/MedQA-USMLE-4-options", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") options = item.get("options", {}) answer_idx = item.get("answer_idx", "") answer = options.get(answer_idx, "") if isinstance(options, dict) else "" ctx = "Options: " + " | ".join(f"{k}: {v}" for k, v in options.items()) if isinstance(options, dict) else "" if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"medqa_{i}", source="medqa", difficulty=DifficultyLevel.EXPERT, category="medical_qa")) return out except Exception as e: print(f" medqa loader error: {e}"); return [] def _load_medmcqa(self, cap, hf_load): try: ds = hf_load("openlifescienceai/medmcqa", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") opts = [item.get(k, "") for k in ["opa", "opb", "opc", "opd"]] cop = item.get("cop", 0) answer = opts[cop - 1] if 1 <= cop <= 4 else "" ctx = "Options: " + " | ".join(f"{chr(65+j)}: {o}" for j, o in enumerate(opts)) if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"medmcqa_{i}", source="medmcqa", difficulty=DifficultyLevel.EXPERT, category="medical_mcq")) return out except Exception as e: print(f" medmcqa loader error: {e}"); return [] def _load_medical_questions(self, cap, hf_load): try: ds = hf_load("medical_questions_pairs", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q1 = item.get("question_1", "") q2 = item.get("question_2", "") label = item.get("label", 0) answer = "similar" if label == 1 else "different" ctx = f"Question 1: {q1} | Question 2: {q2}" if not q1 or not q2: continue out.append(QAExample( question=f"Are these two medical questions asking about the same thing?", context=ctx[:1500], answer=answer, id=f"medpairs_{i}", source="medical_questions", difficulty=DifficultyLevel.ADVANCED, category="medical_similarity")) return out except Exception as e: print(f" medical_questions loader error: {e}"); return [] def _load_qasc(self, cap, hf_load): try: ds = hf_load("allenai/qasc", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") choices = item.get("choices", {}) labels = choices.get("label", []) if isinstance(choices, dict) else [] texts = choices.get("text", []) if isinstance(choices, dict) else [] ans_key = item.get("answerKey", "") fact1 = item.get("fact1", "") fact2 = item.get("fact2", "") answer = next((t for l, t in zip(labels, texts) if l == ans_key), "") ctx = f"Fact 1: {fact1} | Fact 2: {fact2} | Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts)) if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"qasc_{i}", source="qasc", difficulty=DifficultyLevel.ADVANCED, category="multi_hop_science")) return out except Exception as e: print(f" qasc loader error: {e}"); return [] def _load_quartz(self, cap, hf_load): try: ds = hf_load("allenai/quartz", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") choices = item.get("choices", {}) labels = choices.get("label", []) if isinstance(choices, dict) else [] texts = choices.get("text", []) if isinstance(choices, dict) else [] ans_key = item.get("answerKey", "") para = item.get("para", "") answer = next((t for l, t in zip(labels, texts) if l == ans_key), "") ctx = f"{para} | Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts)) if not q or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"quartz_{i}", source="quartz", difficulty=DifficultyLevel.ADVANCED, category="qualitative_science")) return out except Exception as e: print(f" quartz loader error: {e}"); return [] def _load_quail(self, cap, hf_load): try: ds = hf_load("potsawee/quail", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") ctx = item.get("context", "") answers = item.get("answers", []) correct_idx = item.get("correct_answer_id", 0) answer = answers[correct_idx] if answers and 0 <= correct_idx < len(answers) else "" if not q or not ctx or not answer: continue out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"quail_{i}", source="quail", difficulty=DifficultyLevel.ADVANCED, category="reading_comprehension")) return out except Exception as e: print(f" quail loader error: {e}"); return [] def _load_pubmedqa(self, cap, hf_load): try: ds = hf_load("qiaojin/PubMedQA", "pqa_labeled", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") ctx_list = item.get("context", {}) if isinstance(ctx_list, dict): ctx = " ".join(ctx_list.get("contexts", []))[:1500] else: ctx = str(ctx_list)[:1500] answer = item.get("long_answer", item.get("final_decision", "")) if not q or not ctx or not answer: continue out.append(QAExample( question=q, context=ctx, answer=str(answer), id=f"pubmedqa_{i}", source="pubmedqa", difficulty=DifficultyLevel.EXPERT, category="biomedical_qa")) return out except Exception as e: print(f" pubmedqa loader error: {e}"); return [] def _load_xsum(self, cap, hf_load): try: ds = hf_load("EdinburghNLP/xsum", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): doc = item.get("document", "") summary = item.get("summary", "") if not doc or not summary: continue out.append(QAExample( question="Summarise this document in one sentence.", context=doc[:1500], answer=summary, id=f"xsum_{i}", source="xsum", difficulty=DifficultyLevel.ADVANCED, category="summarisation")) return out except Exception as e: print(f" xsum loader error: {e}"); return [] def _load_sciq(self, cap, hf_load): try: ds = hf_load("allenai/sciq", split=f"train[:{cap}]") out = [] for i, item in enumerate(ds): q = item.get("question", "") support = item.get("support", "") answer = item.get("correct_answer", "") d1 = item.get("distractor1", "") d2 = item.get("distractor2", "") d3 = item.get("distractor3", "") if not q or not answer: continue ctx = f"{support} | Options: {answer} | {d1} | {d2} | {d3}" if support else f"Options: {answer} | {d1} | {d2} | {d3}" out.append(QAExample( question=q, context=ctx[:1500], answer=answer, id=f"sciq_{i}", source="sciq", difficulty=DifficultyLevel.ADVANCED, category="science_qa")) return out except Exception as e: print(f" sciq loader error: {e}"); return [] def _update_statistics(self) -> None: self.statistics.total_examples = len(self.examples) self.statistics.examples_by_source = {} self.statistics.examples_by_difficulty = {} self.statistics.examples_by_category = {} for ex in self.examples: for d, k in [(self.statistics.examples_by_source, ex.source), (self.statistics.examples_by_difficulty, ex.difficulty.value), (self.statistics.examples_by_category, ex.category)]: d[k] = d.get(k, 0) + 1 if self.examples: self.statistics.average_context_length = sum(len(e.context) for e in self.examples) / len(self.examples) self.statistics.average_question_length = sum(len(e.question) for e in self.examples) / len(self.examples) def _build_indices(self) -> None: self.indices_by_difficulty = {d: [] for d in DifficultyLevel} self.indices_by_category = {} for i, ex in enumerate(self.examples): self.indices_by_difficulty[ex.difficulty].append(i) if ex.category not in self.indices_by_category: self.indices_by_category[ex.category] = [] self.indices_by_category[ex.category].append(i)