# Hugging Face Space: SamSankar/hallucination-guard-env (status: Running)
# File size: 16,666 Bytes

"""
Cache Generator for HallucinationGuard-Env
==========================================
Run this ONCE on your PC to download all datasets and save them as cache files.
Then upload the cache/ folder to your HF Space.

Usage:
    pip install datasets
    python generate_cache.py

Output:
    server/cache/squad_10000.json
    server/cache/trivia_qa_10000.json
    server/cache/halueval_5000.json
    ... etc (one file per dataset, named <dataset>_<cap>.json)

After this finishes, upload everything:
    python -c "from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='.', repo_id='SamSankar/hallucination-guard-env', repo_type='space', ignore_patterns=['__pycache__', '*.pyc']); print('Upload complete!')"
"""

import json
import os
import sys
import time
from pathlib import Path

# `datasets` is the only third-party import this script needs. If it is
# missing, print an install hint and exit with status 1 instead of showing
# an ImportError traceback.
try:
    from datasets import load_dataset as hf_load
except ImportError:
    print("Run: pip install datasets")
    sys.exit(1)

# Cache files live in <directory containing this script>/server/cache.
# NOTE: the directory is created as a module-level side effect (on import),
# not only when main() runs.
CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "server", "cache")
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

# Dataset key -> example cap. main() passes the cap to the loader registered
# under the same key in LOADERS and also embeds it in the cache file name
# (<key>_<cap>.json), so changing a cap makes main() write a new file rather
# than reuse the old one. Loaders drop unusable rows, so a cache may hold
# fewer examples than its cap.
DATASETS = {
    "squad":         10000,
    "trivia_qa":     10000,
    "halueval":      5000,
    "truthful_qa":   817,
    "hotpotqa":      10000,
    "boolq":         9427,
    "faithdial":     10000,  # rows are read from Anthropic/hh-rlhf by load_faithdial
    "fever":         10000,  # rows are read from stanfordnlp/snli by load_fever
    "arc":           3370,
    "openbookqa":    4957,
    "ms_marco":      10000,
    "coqa":          7199,
    "nq_open":       8000,
    "commonsense_qa": 8000,
    "winogrande":    5000,
}
# Target: ~100k examples

def load_squad(cap):
    """Return up to ``cap`` SQuAD training rows as normalized QA records.

    Rows without a first answer text or without a context passage are
    skipped, so fewer than ``cap`` records may come back. Each context is
    truncated to 1500 characters; ``id`` carries the row's index in the slice.
    """
    rows = hf_load("squad", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        answer_texts = row.get("answers", {}).get("text", [])
        first_answer = answer_texts[0] if answer_texts else ""
        passage = row.get("context")
        if not (first_answer and passage):
            continue
        records.append({
            "question": row["question"],
            "context": row["context"][:1500],
            "answer": first_answer,
            "id": f"squad_{idx}",
            "source": "squad",
            "difficulty": "intermediate",
            "category": "reading_comprehension",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_trivia_qa(cap):
    """Return up to ``cap`` TriviaQA (rc.wikipedia) training rows as QA records.

    The context is the first ``wiki_context`` entry of the row's
    ``entity_pages``; the answer is the first normalized alias, falling back
    to the answer's ``value``. Rows with no usable context or answer are
    skipped, so fewer than ``cap`` records may be returned. Contexts are
    truncated to 1500 characters.
    """
    ds = hf_load("trivia_qa", "rc.wikipedia", split=f"train[:{cap}]")
    out = []
    for i, item in enumerate(ds):
        pages = item.get("entity_pages", {})
        ctxs = pages.get("wiki_context", []) if isinstance(pages, dict) else []
        if not ctxs:
            # An empty list / None must mean "no context". Stringifying it
            # (the previous behavior) produced the bogus contexts "[]" and
            # "None", which then passed the emptiness check below.
            continue
        ctx = ctxs[0] if isinstance(ctxs, list) else str(ctxs)
        if not ctx:
            continue
        answer_info = item.get("answer", {})
        aliases = answer_info.get("normalized_aliases", [])
        answer = aliases[0] if aliases else answer_info.get("value", "")
        if not answer:
            continue
        out.append({"question": item["question"], "context": ctx[:1500],
                    "answer": str(answer), "id": f"triviaqa_{i}", "source": "trivia_qa",
                    "difficulty": "intermediate", "category": "trivia",
                    "hallucination_type": None, "entities": [], "metadata": {}})
    return out

def load_halueval(cap):
    """Return up to ``cap`` HaluEval QA rows as normalized QA records.

    The context comes from ``knowledge`` (falling back to ``context``) and the
    answer from ``right_answer`` (falling back to ``answer``). Rows missing a
    question or an answer are skipped. Contexts are stringified and truncated
    to 1500 characters.
    """
    rows = hf_load("pminervini/HaluEval", "qa", split=f"data[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        knowledge = row.get("knowledge", row.get("context", ""))
        right_answer = row.get("right_answer", row.get("answer", ""))
        if not (question and right_answer):
            continue
        records.append({
            "question": question,
            "context": str(knowledge)[:1500],
            "answer": str(right_answer),
            "id": f"halueval_{idx}",
            "source": "halueval",
            "difficulty": "advanced",
            "category": "hallucination_detection",
            "hallucination_type": row.get("hallucination_type"),
            "entities": [],
            "metadata": {},
        })
    return records

def load_truthful_qa(cap):
    """Return records for the first ``cap`` TruthfulQA (generation) validation rows.

    The whole validation split is loaded and iteration stops once ``cap`` rows
    have been visited. The context is the row's correct answers joined by
    spaces, or the question itself when there are none. Rows without a
    ``best_answer`` are skipped.
    """
    rows = hf_load("truthful_qa", "generation", split="validation")
    records = []
    for idx, row in enumerate(rows):
        if idx >= cap:
            break
        best_answer = row.get("best_answer", "")
        correct_answers = row.get("correct_answers", [])
        if correct_answers:
            context = " ".join(correct_answers)
        else:
            context = row.get("question", "")
        if not best_answer:
            continue
        records.append({
            "question": row["question"],
            "context": context[:1500],
            "answer": best_answer,
            "id": f"truthfulqa_{idx}",
            "source": "truthful_qa",
            "difficulty": "expert",
            "category": "factuality",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_hotpotqa(cap):
    """Return up to ``cap`` HotpotQA (fullwiki) training rows as QA records.

    The context is built as "<title>: <sentences joined by spaces>" for each
    title/sentence-list pair, joined by spaces and truncated to 1500
    characters. Rows with an empty question, answer or context are skipped.
    """
    rows = hf_load("hotpot_qa", "fullwiki", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        gold = row.get("answer", "")
        titles = row.get("context", {}).get("title", [])
        sentence_lists = row.get("context", {}).get("sentences", [])
        paragraphs = [
            f"{title}: {' '.join(sentences)}"
            for title, sentences in zip(titles, sentence_lists)
        ]
        context = " ".join(paragraphs)[:1500]
        if not (question and gold and context):
            continue
        records.append({
            "question": question,
            "context": context,
            "answer": str(gold),
            "id": f"hotpotqa_{idx}",
            "source": "hotpotqa",
            "difficulty": "advanced",
            "category": "multi_hop_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_boolq(cap):
    """Return up to ``cap`` BoolQ training rows as yes/no QA records.

    The row's truthy/falsy ``answer`` becomes the string "yes" or "no". Rows
    without a question or a passage are skipped. Passages are truncated to
    1500 characters.
    """
    rows = hf_load("google/boolq", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        passage = row.get("passage", "")
        if not (question and passage):
            continue
        verdict = "yes" if row.get("answer", False) else "no"
        records.append({
            "question": question,
            "context": passage[:1500],
            "answer": verdict,
            "id": f"boolq_{idx}",
            "source": "boolq",
            "difficulty": "beginner",
            "category": "yes_no_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_faithdial(cap):
    """Return dialogue QA records built from up to ``cap`` HH-RLHF training rows.

    Despite the "faithdial" name, the data is read from Anthropic/hh-rlhf.
    For each row the ``chosen`` transcript is scanned turn by turn and the
    LAST Human/Assistant exchange in which both sides are non-empty becomes
    the question/answer pair. The context is the first 800 characters of the
    transcript; question and answer are truncated to 200 and 400 characters.
    Rows with no usable exchange are skipped.
    """
    # Anthropic HH-RLHF: human preference dialogues, 100% parquet
    rows = hf_load("Anthropic/hh-rlhf", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        transcript = row.get("chosen", "")
        if not transcript:
            continue

        last_question, last_answer = "", ""
        # Every chunk after the first one starts right after a "Human:" marker.
        for turn in transcript.split("Human:")[1:]:
            human_text, marker, assistant_text = turn.partition("Assistant:")
            if not marker:
                continue
            # A chunk cannot contain another "Human:" marker, so everything
            # after the first "Assistant:" is the reply.
            human_text = human_text.strip()
            assistant_text = assistant_text.strip()
            if human_text and assistant_text:
                last_question, last_answer = human_text, assistant_text

        if not (last_question and last_answer):
            continue
        records.append({
            "question": last_question[:200],
            "context": transcript[:800],
            "answer": last_answer[:400],
            "id": f"faithdial_{idx}",
            "source": "faithdial",
            "difficulty": "advanced",
            "category": "hallucination_detection",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_fever(cap):
    """Return fact-verification records built from up to ``cap`` SNLI training rows.

    Despite the "fever" name, the data is read from stanfordnlp/snli and the
    NLI labels are translated into FEVER-style verdicts:

        0 (entailment)    -> "SUPPORTS"
        1 (neutral)       -> "NOT ENOUGH INFO"
        2 (contradiction) -> "REFUTES"

    Rows with a missing premise or hypothesis, or without a gold label
    (-1 / missing / unknown id), are skipped.
    """
    # Stanford NLI: entailment/contradiction/neutral — pure parquet, fact verification task
    ds = hf_load("stanfordnlp/snli", split=f"train[:{cap}]")
    # SNLI's class ids are entailment=0, neutral=1, contradiction=2. The
    # previous table had 1 and 2 swapped, so contradictions were cached as
    # "NOT ENOUGH INFO" and neutral pairs as "REFUTES".
    label_map = {0: "SUPPORTS", 1: "NOT ENOUGH INFO", 2: "REFUTES"}
    out = []
    for i, item in enumerate(ds):
        premise    = item.get("premise", "")
        hypothesis = item.get("hypothesis", "")
        label_int  = item.get("label", -1)
        if not premise or not hypothesis or label_int is None:
            continue
        label = label_map.get(int(label_int))
        if label is None:
            # -1 marks "no gold label"; any other unknown id is dropped too.
            continue
        out.append({"question": f"Does the premise support or refute this hypothesis? Hypothesis: {hypothesis}",
                    "context": f"Premise: {premise}", "answer": label,
                    "id": f"fever_{i}", "source": "fever",
                    "difficulty": "advanced", "category": "fact_verification",
                    "hallucination_type": None, "entities": [], "metadata": {}})
    return out


def load_arc(cap):
    """Return up to ``cap`` ARC-Challenge records drawn from all three splits.

    The train, validation and test splits are read in that order until
    ``cap`` records have been collected. The context lists every choice as
    "<label>: <text>" and the answer is the text of the choice whose label
    equals ``answerKey``. Rows without a question or a resolvable answer are
    skipped.

    Loading is best-effort per split: if a split fails to load or to iterate,
    a warning is written to stderr and the remaining splits are still tried,
    so the function may return fewer records (possibly none) instead of
    raising.
    """
    # Combine train + validation + test to get full ~3370 examples
    out = []
    for split in ("train", "validation", "test"):
        if len(out) >= cap:
            # Cap already reached: do not download splits we would not use.
            break
        try:
            ds = hf_load("allenai/ai2_arc", "ARC-Challenge", split=split)
            for item in ds:
                if len(out) >= cap:
                    break
                q = item.get("question", "")
                choices = item.get("choices", {})
                ans_key = item.get("answerKey", "")
                labels = choices.get("label", [])
                texts  = choices.get("text", [])
                ctx = "Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts))
                answer = next((t for l, t in zip(labels, texts) if l == ans_key), "")
                if not q or not answer:
                    continue
                out.append({"question": q, "context": ctx, "answer": answer,
                            "id": f"arc_{len(out)}", "source": "arc",
                            "difficulty": "advanced", "category": "science_exam",
                            "hallucination_type": None, "entities": [], "metadata": {}})
        except Exception as exc:
            # Stay best-effort, but say which split was lost and why instead
            # of silently returning a short (or empty) result.
            print(f"[load_arc] split {split!r} failed: {exc}", file=sys.stderr)
            continue
    return out

def load_openbookqa(cap):
    """Return up to ``cap`` OpenBookQA training rows as QA records.

    The context is "Core fact: <fact1> | Choices: <label>: <text> | ..." and
    the answer is the text of the choice whose label equals ``answerKey``.
    Rows without a question stem or a resolvable answer are skipped. Contexts
    are truncated to 1500 characters.

    The "additional" configuration is loaded because it is the one that
    carries the ``fact1`` column; the "main" configuration has no such column,
    which left every context with an empty "Core fact:" prefix.
    """
    ds = hf_load("allenai/openbookqa", "additional", split=f"train[:{cap}]")
    out = []
    for i, item in enumerate(ds):
        q = item.get("question_stem", "")
        choices = item.get("choices", {})
        ans_key = item.get("answerKey", "")
        labels = choices.get("label", [])
        texts  = choices.get("text", [])
        fact   = item.get("fact1", "")
        ctx = "Choices: " + " | ".join(f"{l}: {t}" for l, t in zip(labels, texts))
        if fact:
            # Only advertise a core fact when the row actually has one.
            ctx = f"Core fact: {fact} | {ctx}"
        answer = next((t for l, t in zip(labels, texts) if l == ans_key), "")
        if not q or not answer:
            continue
        out.append({"question": q, "context": ctx[:1500], "answer": answer,
                    "id": f"openbookqa_{i}", "source": "openbookqa",
                    "difficulty": "intermediate", "category": "science_facts",
                    "hallucination_type": None, "entities": [], "metadata": {}})
    return out

def load_ms_marco(cap):
    """Return up to ``cap`` MS MARCO v2.1 training rows as QA records.

    The context is all passage texts joined by spaces and truncated to 1500
    characters; the answer is the first entry of ``answers``. Rows with no
    query, no context, no answer, or the placeholder answer
    "No Answer Present." are skipped.
    """
    rows = hf_load("microsoft/ms_marco", "v2.1", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        query = row.get("query", "")
        passages = row.get("passages", {})
        if isinstance(passages, dict):
            passage_texts = passages.get("passage_text", [])
        else:
            passage_texts = []
        context = " ".join(passage_texts)[:1500] if passage_texts else ""
        answers = row.get("answers", [])
        first_answer = answers[0] if answers else ""
        if not (query and context and first_answer):
            continue
        if first_answer == "No Answer Present.":
            continue
        records.append({
            "question": query,
            "context": context,
            "answer": str(first_answer),
            "id": f"msmarco_{idx}",
            "source": "ms_marco",
            "difficulty": "intermediate",
            "category": "web_search_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records

def load_coqa(cap):
    """Return up to ``cap`` CoQA training stories as single-turn QA records.

    Only the FIRST question of each story and the first ``input_text`` answer
    are used. Rows lacking a story, questions or answer texts, or whose first
    question/answer is empty, are skipped. Stories are truncated to 1500
    characters.
    """
    rows = hf_load("stanfordnlp/coqa", split=f"train[:{cap}]")
    records = []
    for idx, row in enumerate(rows):
        story = row.get("story", "")
        questions = row.get("questions", [])
        answers = row.get("answers", {})
        answer_texts = answers.get("input_text", []) if isinstance(answers, dict) else []
        if not (story and questions and answer_texts):
            continue
        # Both sequences are known to be non-empty at this point.
        first_question = questions[0]
        first_answer = answer_texts[0]
        if not (first_question and first_answer):
            continue
        records.append({
            "question": str(first_question),
            "context": story[:1500],
            "answer": str(first_answer),
            "id": f"coqa_{idx}",
            "source": "coqa",
            "difficulty": "intermediate",
            "category": "conversational_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records


def load_nq_open(cap):
    """Return up to ``cap`` NQ-Open training rows as closed-book QA records.

    The dataset provides no passage here, so the context is the fixed prompt
    "Answer based on your knowledge: " followed by the question. The answer is
    the first entry of the row's ``answer`` list. Rows without a question or
    an answer are skipped.
    """
    rows = hf_load("nq_open", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        answers = row.get("answer", [])
        first_answer = answers[0] if answers else ""
        if not (question and first_answer):
            continue
        records.append({
            "question": question,
            "context": "Answer based on your knowledge: " + question,
            "answer": str(first_answer),
            "id": f"nq_open_{idx}",
            "source": "nq_open",
            "difficulty": "intermediate",
            "category": "open_domain_qa",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records


def load_commonsense_qa(cap):
    """Return up to ``cap`` CommonsenseQA training rows as QA records.

    The context lists every choice as "<label>: <text>" separated by " | ";
    the answer is the text of the choice whose label equals ``answerKey``.
    Rows without a question or a resolvable answer are skipped.
    """
    rows = hf_load("tau/commonsense_qa", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        question = row.get("question", "")
        choices = row.get("choices", {})
        if isinstance(choices, dict):
            labels = choices.get("label", [])
            texts = choices.get("text", [])
        else:
            labels, texts = [], []
        answer_key = row.get("answerKey", "")
        listing = " | ".join(f"{label}: {text}" for label, text in zip(labels, texts))
        context = "Choices: " + listing
        answer = ""
        for label, text in zip(labels, texts):
            if label == answer_key:
                answer = text
                break
        if not (question and answer):
            continue
        records.append({
            "question": question,
            "context": context,
            "answer": answer,
            "id": f"csqa_{idx}",
            "source": "commonsense_qa",
            "difficulty": "intermediate",
            "category": "commonsense_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records


def load_winogrande(cap):
    """Return up to ``cap`` WinoGrande (winogrande_xl) training rows as QA records.

    The answer is ``option1`` when the row's ``answer`` stringifies to "1" and
    ``option2`` for any other value. Rows with an empty sentence or an empty
    selected option are skipped.
    """
    # WinoGrande — commonsense reasoning, 100% parquet, no scripts
    rows = hf_load("allenai/winogrande", "winogrande_xl", split="train[:%d]" % cap)
    records = []
    for idx, row in enumerate(rows):
        sentence = row.get("sentence", "")
        first_option = row.get("option1", "")
        second_option = row.get("option2", "")
        picks_first = str(row.get("answer", "1")) == "1"
        if picks_first:
            answer = first_option
        else:
            answer = second_option
        if not (sentence and answer):
            continue
        context = f"Sentence: {sentence} Options: 1: {first_option} | 2: {second_option}"
        records.append({
            "question": "Which option correctly fills the blank? " + sentence,
            "context": context,
            "answer": answer,
            "id": f"winogrande_{idx}",
            "source": "winogrande",
            "difficulty": "intermediate",
            "category": "commonsense_reasoning",
            "hallucination_type": None,
            "entities": [],
            "metadata": {},
        })
    return records



# Dispatch table: dataset key -> loader function. Each loader takes the
# example cap and returns a list of record dicts. main() looks loaders up by
# the keys of DATASETS, so every DATASETS key needs an entry here.
LOADERS = {
    "squad":        load_squad,
    "trivia_qa":    load_trivia_qa,
    "halueval":     load_halueval,
    "truthful_qa":  load_truthful_qa,
    "hotpotqa":     load_hotpotqa,
    "boolq":        load_boolq,
    "faithdial":    load_faithdial,
    "fever":        load_fever,
    "arc":          load_arc,
    "openbookqa":   load_openbookqa,
    "ms_marco":     load_ms_marco,
    "coqa":         load_coqa,
    "nq_open":        load_nq_open,
    "commonsense_qa": load_commonsense_qa,
    "winogrande":     load_winogrande,
}

def main():
    """Build (or reuse) one JSON cache file per entry in ``DATASETS``.

    For every dataset key the cache file ``<CACHE_DIR>/<key>_<cap>.json`` is
    reused when it already exists and parses as JSON. Otherwise the matching
    loader from ``LOADERS`` is run and its records are written out.

    Failure handling:
      * An existing cache file that cannot be read or parsed (e.g. left
        truncated by an interrupted run) is regenerated instead of aborting
        the whole script.
      * Records are written to a temporary file and moved into place with
        ``os.replace``, so an interrupted run never leaves a half-written
        cache behind.
      * A loader that raises, or returns no records, is reported as failed
        and no cache file is written, so the next run retries it.
    """
    total = 0
    banner = "=" * 55
    print(f"\n{banner}")
    print("  HallucinationGuard Cache Generator")
    print(f"  Output: {CACHE_DIR}")
    print(f"{banner}\n")

    for ds_name, cap in DATASETS.items():
        cache_file = os.path.join(CACHE_DIR, f"{ds_name}_{cap}.json")

        if os.path.exists(cache_file):
            try:
                with open(cache_file, encoding="utf-8") as f:
                    existing = json.load(f)
            except (OSError, ValueError) as e:
                # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
                print(f"  ⚠️ {ds_name}: unreadable cache ({e}) — regenerating")
            else:
                print(f"  ✅ {ds_name}: already cached ({len(existing)} examples) — skipping")
                total += len(existing)
                continue

        print(f"  Downloading {ds_name} ({cap} examples)...", end=" ", flush=True)
        t0 = time.time()
        tmp_file = cache_file + ".tmp"
        try:
            loader = LOADERS[ds_name]
            examples = loader(cap)
            if not examples:
                # An empty cache would be treated as "already cached" forever.
                raise RuntimeError("loader returned no examples")
            with open(tmp_file, "w", encoding="utf-8") as f:
                json.dump(examples, f)
            # Atomic move: the final name only ever points at a complete file.
            os.replace(tmp_file, cache_file)
            elapsed = time.time() - t0
            total += len(examples)
            print(f"✅ {len(examples)} examples saved ({elapsed:.0f}s)")
        except Exception as e:
            # Top-level boundary: report and move on to the next dataset.
            print(f"❌ Failed: {e}")
            try:
                os.remove(tmp_file)
            except OSError:
                # Nothing to clean up (the temp file was never created).
                pass

    print(f"\n{banner}")
    print(f"  Done! Total examples cached: {total:,}")
    print(f"  Cache location: {CACHE_DIR}")
    print("\n  Now upload to HF Space:")
    print("  python -c \"from huggingface_hub import HfApi; api = HfApi(); api.upload_folder(folder_path='.', repo_id='SamSankar/hallucination-guard-env', repo_type='space', ignore_patterns=['__pycache__', '*.pyc']); print('Upload complete!')\"")
    print(f"{banner}\n")

if __name__ == "__main__":
    main()