"""Build Extended Reference KG from Wikidata. Downloads temporal facts (positions held, P39) for the top ~200 most notable politicians and saves them to data/reference_kg/verified_events.json, replacing or extending the existing 16-entity Reference KG. The script queries Wikidata for: - US Presidents and Vice Presidents - US Senators and Representatives (most notable) - Secretaries of State and Defense - UK Prime Ministers - EU/European leaders (Merkel, Macron, etc.) - Other major world leaders Usage: python scripts/build_reference_kg.py python scripts/build_reference_kg.py --dry-run # show entities, don't save python scripts/build_reference_kg.py --merge # merge with existing KG Output: data/reference_kg/verified_events.json data/reference_kg/build_log_YYYY-MM-DD.json """ from __future__ import annotations import argparse import json import logging import time from datetime import datetime from pathlib import Path import sys _PROJECT_ROOT = Path(__file__).parent.parent if str(_PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(_PROJECT_ROOT)) logging.basicConfig( level=logging.INFO, format="%(asctime)s %(name)-30s %(levelname)s %(message)s", datefmt="%H:%M:%S", ) logger = logging.getLogger("build_reference_kg") OUTPUT_DIR = _PROJECT_ROOT / "data" / "reference_kg" OUTPUT_FILE = OUTPUT_DIR / "verified_events.json" LOG_FILE = OUTPUT_DIR / f"build_log_{datetime.now().strftime('%Y-%m-%d')}.json" OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # delay between Wikidata queries to avoid rate limiting QUERY_DELAY = 1.5 # section: entity list # Format: (name, wikidata_QID, category) # QIDs verified manually from Wikidata ENTITIES = [ # US Presidents ("Barack Obama", "Q76", "us_president"), ("Donald Trump", "Q22686", "us_president"), ("Joe Biden", "Q6279", "us_president"), ("Bill Clinton", "Q1124", "us_president"), ("George W. Bush", "Q207", "us_president"), ("George H.W. Bush", "Q23505", "us_president"), ("Ronald Reagan", "Q9960", "us_president"), ("Jimmy Carter", "Q23685", "us_president"), ("Gerald Ford", "Q9916", "us_president"), ("Richard Nixon", "Q9588", "us_president"), ("Lyndon B. Johnson", "Q9640", "us_president"), ("John F. Kennedy", "Q9696", "us_president"), ("Dwight Eisenhower", "Q9916", "us_president"), ("Harry S. Truman", "Q9682", "us_president"), # US Vice Presidents ("Mike Pence", "Q373472", "us_vp"), ("Dick Cheney", "Q48259", "us_vp"), ("Al Gore", "Q19673", "us_vp"), ("Dan Quayle", "Q48305", "us_vp"), ("Walter Mondale", "Q223779", "us_vp"), ("Kamala Harris", "Q185530", "us_vp"), # US Senators (notable) ("Hillary Clinton", "Q6294", "us_senator"), ("John McCain", "Q10390", "us_senator"), ("Bernie Sanders", "Q359442", "us_senator"), ("Elizabeth Warren", "Q1173433","us_senator"), ("Mitch McConnell", "Q355522", "us_senator"), ("Chuck Schumer", "Q380900", "us_senator"), ("Marco Rubio", "Q349455", "us_senator"), ("Ted Cruz", "Q905014", "us_senator"), ("Rand Paul", "Q671426", "us_senator"), ("Lindsey Graham", "Q378702", "us_senator"), ("Amy Klobuchar", "Q23714", "us_senator"), ("Cory Booker", "Q922820", "us_senator"), ("Bob Menendez", "Q539566", "us_senator"), ("Dianne Feinstein", "Q231033", "us_senator"), ("Susan Collins", "Q276932", "us_senator"), ("Lisa Murkowski", "Q230539", "us_senator"), # US House Representatives (notable) ("Nancy Pelosi", "Q170581", "us_representative"), ("Kevin McCarthy", "Q977592", "us_representative"), ("Paul Ryan", "Q381812", "us_representative"), ("John Boehner", "Q310461", "us_representative"), ("Newt Gingrich", "Q313756", "us_representative"), ("Alexandria Ocasio-Cortez","Q55223754","us_representative"), ("Ilhan Omar", "Q55405334","us_representative"), ("Matt Gaetz", "Q24834954","us_representative"), ("Jim Jordan", "Q1689040", "us_representative"), ("Adam Schiff", "Q350843", "us_representative"), ("Jerry Nadler", "Q555961", "us_representative"), ("Maxine Waters", "Q234302", "us_representative"), ("Steny Hoyer", "Q558566", "us_representative"), ("Eric Cantor", "Q726155", "us_representative"), # US Secretaries of State / Defense ("John Kerry", "Q33866", "us_cabinet"), ("Rex Tillerson", "Q7313083","us_cabinet"), ("Mike Pompeo", "Q974374", "us_cabinet"), ("Antony Blinken", "Q720301", "us_cabinet"), ("Condoleezza Rice", "Q62398", "us_cabinet"), ("Colin Powell", "Q34296", "us_cabinet"), ("Madeleine Albright", "Q41225", "us_cabinet"), ("Robert Gates", "Q312930", "us_cabinet"), ("Donald Rumsfeld", "Q48259", "us_cabinet"), ("Lloyd Austin", "Q6664165","us_cabinet"), # US Governors (notable) ("Mitt Romney", "Q1036", "us_governor"), ("Arnold Schwarzenegger", "Q2685", "us_governor"), ("Andrew Cuomo", "Q312528", "us_governor"), ("Gavin Newsom", "Q979879", "us_governor"), ("Ron DeSantis", "Q20767779","us_governor"), ("Chris Christie", "Q1063986","us_governor"), ("Scott Walker", "Q2608", "us_governor"), ("Greg Abbott", "Q1543000","us_governor"), ("Brian Kemp", "Q16216063","us_governor"), # UK Prime Ministers ("Boris Johnson", "Q180589", "uk_pm"), ("Theresa May", "Q264766", "uk_pm"), ("David Cameron", "Q192", "uk_pm"), ("Gordon Brown", "Q3336", "uk_pm"), ("Tony Blair", "Q9545", "uk_pm"), ("John Major", "Q9624", "uk_pm"), ("Margaret Thatcher", "Q7174", "uk_pm"), ("Rishi Sunak", "Q109404461","uk_pm"), ("Liz Truss", "Q220359", "uk_pm"), ("Keir Starmer", "Q7671853","uk_pm"), # UK Politicians ("Jeremy Corbyn", "Q186630", "uk_politician"), ("Nigel Farage", "Q192893", "uk_politician"), ("Sadiq Khan", "Q3444954","uk_politician"), # European Leaders ("Angela Merkel", "Q567", "eu_leader"), ("Emmanuel Macron", "Q3052772","eu_leader"), ("Nicolas Sarkozy", "Q1631", "eu_leader"), ("François Hollande", "Q191360", "eu_leader"), ("Olaf Scholz", "Q61221", "eu_leader"), ("Gerhard Schroeder", "Q2530", "eu_leader"), ("Silvio Berlusconi", "Q11860", "eu_leader"), ("Mario Draghi", "Q172572", "eu_leader"), ("Pedro Sánchez", "Q58808", "eu_leader"), ("Mariano Rajoy", "Q156560", "eu_leader"), ("Justin Trudeau", "Q3099714","eu_leader"), ("Stephen Harper", "Q154454", "eu_leader"), ("Jean Chrétien", "Q179943", "eu_leader"), ("Vladimir Putin", "Q7747", "world_leader"), ("Volodymyr Zelenskyy", "Q4328088","world_leader"), ("Xi Jinping", "Q15180", "world_leader"), ("Narendra Modi", "Q1058583","world_leader"), ("Benjamin Netanyahu", "Q39455", "world_leader"), ("Recep Tayyip Erdoğan", "Q1058589","world_leader"), # US Judiciary ("John Roberts", "Q190126", "us_judiciary"), ("Ruth Bader Ginsburg", "Q17652", "us_judiciary"), ("Antonin Scalia", "Q186171", "us_judiciary"), ("Clarence Thomas", "Q186120", "us_judiciary"), ("Samuel Alito", "Q329134", "us_judiciary"), ("Sonia Sotomayor", "Q220813", "us_judiciary"), ("Elena Kagan", "Q220812", "us_judiciary"), ("Neil Gorsuch", "Q22248662","us_judiciary"), ("Brett Kavanaugh", "Q2713403","us_judiciary"), ("Amy Coney Barrett", "Q88491423","us_judiciary"), ("Ketanji Brown Jackson", "Q78783", "us_judiciary"), ("Stephen Breyer", "Q329185", "us_judiciary"), ("Anthony Kennedy", "Q329136", "us_judiciary"), ("David Souter", "Q329188", "us_judiciary"), ("Sandra Day O'Connor", "Q185743", "us_judiciary"), # Other notable US politicians ("Al Sharpton", "Q348522", "us_politician"), ("Jesse Jackson", "Q254987", "us_politician"), ("John Lewis", "Q957110", "us_politician"), ("Elijah Cummings", "Q1329428","us_politician"), ("Trey Gowdy", "Q1345735","us_politician"), ("Bennie Thompson", "Q1023040","us_politician"), ("Liz Cheney", "Q1631912","us_politician"), ("Adam Kinzinger", "Q3608736","us_politician"), ("Pete Buttigieg", "Q939894", "us_politician"), ("Eric Adams", "Q1328525","us_politician"), ("Andrew Yang", "Q17491546","us_politician"), ("Tulsi Gabbard", "Q16145726","us_politician"), ("Beto O'Rourke", "Q21994992","us_politician"), ("Stacey Abrams", "Q7597777","us_politician"), ("Raphael Warnock", "Q7293064","us_politician"), ("Jon Ossoff", "Q23039098","us_politician"), ("Doug Jones", "Q17496143","us_politician"), ("Roy Moore", "Q7374376","us_politician"), ("Luther Strange", "Q6706827","us_politician"), ("Jeff Sessions", "Q310847", "us_politician"), ("Eric Holder", "Q866030", "us_politician"), ("Loretta Lynch", "Q16960752","us_politician"), ("William Barr", "Q215082", "us_politician"), ("Merrick Garland", "Q360413", "us_politician"), ("Robert Mueller", "Q367626", "us_politician"), ("James Comey", "Q5028025","us_politician"), ("Andrew McCabe", "Q4760695","us_politician"), ] # section: wikidata fetch def fetch_entity_facts(name: str, qid: str) -> list[dict]: """Fetch all P39 (position held) facts for an entity from Wikidata.""" from backend.pipeline.verification.wikidata import WikidataClient client = WikidataClient() facts = client.get_temporal_facts(qid, relation_properties=["P39", "P463"]) result = [] for f in facts: entry = { "entity": name, "entity_id": qid, "position": f.value_label, "property": f.property_id, } if f.time_start: entry["start_year"] = f.time_start.year entry["start_date"] = f.time_start.strftime("%Y-%m-%d") if f.time_end: entry["end_year"] = f.time_end.year entry["end_date"] = f.time_end.strftime("%Y-%m-%d") if f.time_point: entry["point_year"] = f.time_point.year entry["point_date"] = f.time_point.strftime("%Y-%m-%d") result.append(entry) return result # section: main def main(args: argparse.Namespace) -> None: dry_run = args.dry_run merge = args.merge limit = args.limit entities = ENTITIES[:limit] if limit else ENTITIES print("\n" + "=" * 65) print(" BUILD EXTENDED REFERENCE KG") print(f" Entities : {len(entities)}") print(f" Dry run : {dry_run}") print(f" Merge : {merge}") print(f" Output : {OUTPUT_FILE}") print("=" * 65 + "\n") # load existing KG if merging existing: dict[str, list[dict]] = {} if merge and OUTPUT_FILE.exists(): with open(OUTPUT_FILE, "r", encoding="utf-8") as f: data = json.load(f) # index by entity name for entry in data.get("entities", []): existing[entry["name"]] = entry logger.info(f"Loaded {len(existing)} existing entities for merge.") # fetch facts for each entity all_entities = [] failed = [] log_entries = [] for i, (name, qid, category) in enumerate(entities, 1): logger.info(f"[{i:3}/{len(entities)}] {name} ({qid}) [{category}]") if dry_run: print(f" DRY RUN: would fetch {name} ({qid})") continue try: facts = fetch_entity_facts(name, qid) entity_entry = { "name": name, "qid": qid, "category": category, "facts": facts, "fetched_at": datetime.now().isoformat(), } all_entities.append(entity_entry) log_entries.append({ "name": name, "qid": qid, "facts_count": len(facts), "status": "ok", }) logger.info(f" -> {len(facts)} facts fetched") if not facts: logger.warning(f" -> no facts found for {name}") except Exception as e: logger.error(f" -> FAILED: {e}") failed.append({"name": name, "qid": qid, "error": str(e)}) log_entries.append({ "name": name, "qid": qid, "facts_count": 0, "status": "failed", "error": str(e), }) time.sleep(QUERY_DELAY) if dry_run: print(f"\nDry run complete. Would have fetched {len(entities)} entities.") return # merge with existing if requested if merge and existing: existing_names = {e["name"] for e in all_entities} for name, entry in existing.items(): if name not in existing_names: all_entities.append(entry) logger.info(f"Kept existing: {name}") # build output structure output = { "generated_at": datetime.now().isoformat(), "total_entities": len(all_entities), "total_facts": sum(len(e["facts"]) for e in all_entities), "failed_count": len(failed), "entities": all_entities, "failed": failed, } # save output with open(OUTPUT_FILE, "w", encoding="utf-8") as f: json.dump(output, f, ensure_ascii=False, indent=2) # save build log with open(LOG_FILE, "w", encoding="utf-8") as f: json.dump({ "generated_at": datetime.now().isoformat(), "entries": log_entries, "failed": failed, }, f, ensure_ascii=False, indent=2) # summary print("\n" + "=" * 65) print(" SUMMARY") print("=" * 65) print(f" Entities fetched : {len(all_entities)}") print(f" Total facts : {output['total_facts']}") print(f" Failed : {len(failed)}") print(f" Output : {OUTPUT_FILE}") print(f" Log : {LOG_FILE}") if failed: print(f"\n Failed entities:") for f in failed: print(f" - {f['name']} ({f['qid']}): {f['error']}") print("\n Top entities by fact count:") sorted_entities = sorted(all_entities, key=lambda e: len(e["facts"]), reverse=True) for e in sorted_entities[:10]: print(f" {e['name']:<35} {len(e['facts']):>3} facts") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Build Extended Reference KG from Wikidata") parser.add_argument("--dry-run", action="store_true", help="Show entities without fetching data") parser.add_argument("--merge", action="store_true", help="Merge with existing Reference KG (keep existing entities)") parser.add_argument("--limit", type=int, default=None, help="Limit number of entities (e.g. --limit 20 for quick test)") args = parser.parse_args() main(args)