Spaces:
Sleeping
Sleeping
| import json | |
| import random | |
| import urllib.request | |
| import ast | |
| from pathlib import Path | |
| from typing import Any | |
| FACTS_PATH = Path(__file__).parent / "facts.json" | |
| FAITHEVAL_COUNTERFACTUAL_URL = ( | |
| "https://raw.githubusercontent.com/SalesforceAIResearch/FaithEval/main/" | |
| "data/counterfactual.json" | |
| ) | |
| def _load_dataset(*args: Any, **kwargs: Any) -> Any: | |
| from datasets import load_dataset | |
| return load_dataset(*args, **kwargs) | |
| def _first_text(value: Any) -> str | None: | |
| """Extract the first useful text value from nested dataset fields.""" | |
| if value is None: | |
| return None | |
| if isinstance(value, str): | |
| text = value.strip() | |
| if text.startswith("[") and text.endswith("]"): | |
| try: | |
| parsed = ast.literal_eval(text) | |
| except (SyntaxError, ValueError): | |
| parsed = None | |
| parsed_text = _first_text(parsed) | |
| if parsed_text: | |
| return parsed_text | |
| return text or None | |
| if isinstance(value, (int, float)): | |
| return str(value) | |
| if isinstance(value, dict): | |
| for key in ("text", "answer", "answers", "value"): | |
| text = _first_text(value.get(key)) | |
| if text: | |
| return text | |
| return None | |
| if isinstance(value, (list, tuple)): | |
| for item in value: | |
| text = _first_text(item) | |
| if text: | |
| return text | |
| return None | |
| def _word_count(text: str) -> int: | |
| return len(text.split()) | |
| def _clean_question(text: Any) -> str | None: | |
| question = _first_text(text) | |
| if not question: | |
| return None | |
| question = question.strip() | |
| if not question.endswith("?"): | |
| question = f"{question}?" | |
| return question | |
| def _natural_questions_answer(row: dict[str, Any]) -> str | None: | |
| annotations = row.get("annotations") or {} | |
| short_answers = annotations.get("short_answers") | |
| answer = _first_text(short_answers) | |
| if answer and _word_count(answer) <= 5: | |
| return answer | |
| return None | |
| def load_natural_questions(n: int = 300) -> list[dict[str, str]]: | |
| facts: list[dict[str, str]] = [] | |
| dataset = _load_dataset( | |
| "google-research-datasets/natural_questions", | |
| split="train", | |
| streaming=True, | |
| ) | |
| for row in dataset: | |
| question = _clean_question(row.get("question") or row.get("question_text")) | |
| answer = _natural_questions_answer(row) | |
| if not question or not answer: | |
| continue | |
| facts.append( | |
| { | |
| "question": question, | |
| "answer": answer, | |
| "source": "natural_questions", | |
| "conflict_type": "entity", | |
| } | |
| ) | |
| if len(facts) >= n: | |
| break | |
| return facts | |
| def load_popqa(n: int = 150) -> list[dict[str, str]]: | |
| facts: list[dict[str, str]] = [] | |
| dataset = _load_dataset("akariasai/PopQA", split="test") | |
| for row in dataset: | |
| question = _clean_question(row.get("question")) | |
| answer = _first_text(row.get("possible_answers")) | |
| if not question or not answer: | |
| continue | |
| facts.append( | |
| { | |
| "question": question, | |
| "answer": answer, | |
| "source": "popqa", | |
| "conflict_type": "entity", | |
| "entity": _first_text(row.get("subj") or row.get("entity")) or "", | |
| "relation": _first_text(row.get("prop") or row.get("relation")) or "", | |
| } | |
| ) | |
| if len(facts) >= n: | |
| break | |
| return facts | |
| def _iter_faitheval_items(payload: Any) -> list[dict[str, Any]]: | |
| if isinstance(payload, list): | |
| return [item for item in payload if isinstance(item, dict)] | |
| if isinstance(payload, dict): | |
| for key in ("data", "examples", "items", "counterfactual"): | |
| items = payload.get(key) | |
| if isinstance(items, list): | |
| return [item for item in items if isinstance(item, dict)] | |
| return [] | |
| def load_faitheval_counterfactual(n: int = 100) -> list[dict[str, str]]: | |
| try: | |
| with urllib.request.urlopen(FAITHEVAL_COUNTERFACTUAL_URL, timeout=20) as response: | |
| payload = json.loads(response.read().decode("utf-8")) | |
| except Exception: | |
| return [] | |
| facts: list[dict[str, str]] = [] | |
| for item in _iter_faitheval_items(payload): | |
| question = _clean_question( | |
| item.get("question") or item.get("query") or item.get("claim") | |
| ) | |
| answer = _first_text( | |
| item.get("answer") | |
| or item.get("gold_answer") | |
| or item.get("label") | |
| or item.get("target") | |
| ) | |
| if not question or not answer: | |
| continue | |
| facts.append( | |
| { | |
| "question": question, | |
| "answer": answer, | |
| "source": "faitheval", | |
| "conflict_type": "counterfactual", | |
| "provided_context": _first_text( | |
| item.get("provided_context") | |
| or item.get("context") | |
| or item.get("evidence") | |
| ) | |
| or "", | |
| } | |
| ) | |
| if len(facts) >= n: | |
| break | |
| return facts | |
| def build_fact_database() -> list[dict[str, str]]: | |
| facts = ( | |
| load_natural_questions() | |
| + load_popqa() | |
| + load_faitheval_counterfactual() | |
| ) | |
| random.shuffle(facts) | |
| FACTS_PATH.parent.mkdir(parents=True, exist_ok=True) | |
| with open(FACTS_PATH, "w", encoding="utf-8") as f: | |
| json.dump(facts, f, indent=2, ensure_ascii=False) | |
| counts: dict[str, int] = {} | |
| for fact in facts: | |
| source = fact.get("source", "unknown") | |
| counts[source] = counts.get(source, 0) + 1 | |
| print(f"Wrote {len(facts)} facts to {FACTS_PATH}") | |
| print(f"Source counts: {counts}") | |
| return facts | |
| if __name__ == "__main__": | |
| build_fact_database() | |