FakeNews-XAI / scripts /build_reference_kg.py
Marius16's picture
Testes + refactor
f22b61b
Raw
History Blame Contribute Delete
15.9 kB
"""Build Extended Reference KG from Wikidata.
Downloads temporal facts (positions held, P39, and memberships, P463) for a
curated list of roughly 140 notable politicians and officials and saves them
to data/reference_kg/verified_events.json, replacing or extending the existing
16-entity Reference KG.
The script queries Wikidata for:
- US Presidents and Vice Presidents
- US Senators and Representatives (most notable)
- Secretaries of State and Defense
- UK Prime Ministers
- EU/European leaders (Merkel, Macron, etc.)
- Other major world leaders
- US governors, US judiciary, and other notable US/UK politicians
Usage:
python scripts/build_reference_kg.py
python scripts/build_reference_kg.py --dry-run # show entities, don't save
python scripts/build_reference_kg.py --merge # merge with existing KG
python scripts/build_reference_kg.py --limit 20 # only the first 20 entities (quick test)
Output:
data/reference_kg/verified_events.json
data/reference_kg/build_log_YYYY-MM-DD.json
"""
from __future__ import annotations
import argparse
import json
import logging
import time
from datetime import datetime
from pathlib import Path
import sys
# Repository root = two directory levels above this script. It is put at the
# front of sys.path (once) so project packages can be imported when the file
# is executed directly.
_PROJECT_ROOT = Path(__file__).parent.parent
if sys.path.count(str(_PROJECT_ROOT)) == 0:
    sys.path.insert(0, str(_PROJECT_ROOT))

# Console logging: time-of-day, padded logger name, level, message.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(name)-30s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("build_reference_kg")

# Output locations. The build-log name carries the date on which the module
# was loaded, and the output directory is created at import time.
OUTPUT_DIR = _PROJECT_ROOT.joinpath("data", "reference_kg")
OUTPUT_FILE = OUTPUT_DIR.joinpath("verified_events.json")
LOG_FILE = OUTPUT_DIR / "build_log_{:%Y-%m-%d}.json".format(datetime.now())
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seconds to wait between Wikidata queries to avoid rate limiting.
QUERY_DELAY = 1.5
# section: entity list
# Format: (name, wikidata_QID, category)
# Order matters: the --limit option processes only the first N entries.
# QIDs were entered by hand and every QID must be unique within this list.
# NOTE(review): two copy/paste duplicates were corrected here (Gerald Ford had
# Eisenhower's Q9916, Donald Rumsfeld had Dick Cheney's Q48259). The remaining
# QIDs have not been re-checked; several short ones (e.g. Harry S. Truman,
# Margaret Thatcher, Mitt Romney, Xi Jinping) look like they may point at
# unrelated Wikidata items -- TODO confirm each against wikidata.org.
ENTITIES = [
    # US Presidents
    ("Barack Obama", "Q76", "us_president"),
    ("Donald Trump", "Q22686", "us_president"),
    ("Joe Biden", "Q6279", "us_president"),
    ("Bill Clinton", "Q1124", "us_president"),
    ("George W. Bush", "Q207", "us_president"),
    ("George H.W. Bush", "Q23505", "us_president"),
    ("Ronald Reagan", "Q9960", "us_president"),
    ("Jimmy Carter", "Q23685", "us_president"),
    ("Gerald Ford", "Q9582", "us_president"),  # was Q9916 (Eisenhower's QID)
    ("Richard Nixon", "Q9588", "us_president"),
    ("Lyndon B. Johnson", "Q9640", "us_president"),
    ("John F. Kennedy", "Q9696", "us_president"),
    ("Dwight Eisenhower", "Q9916", "us_president"),
    ("Harry S. Truman", "Q9682", "us_president"),
    # US Vice Presidents
    ("Mike Pence", "Q373472", "us_vp"),
    ("Dick Cheney", "Q48259", "us_vp"),
    ("Al Gore", "Q19673", "us_vp"),
    ("Dan Quayle", "Q48305", "us_vp"),
    ("Walter Mondale", "Q223779", "us_vp"),
    ("Kamala Harris", "Q185530", "us_vp"),
    # US Senators (notable)
    ("Hillary Clinton", "Q6294", "us_senator"),
    ("John McCain", "Q10390", "us_senator"),
    ("Bernie Sanders", "Q359442", "us_senator"),
    ("Elizabeth Warren", "Q1173433", "us_senator"),
    ("Mitch McConnell", "Q355522", "us_senator"),
    ("Chuck Schumer", "Q380900", "us_senator"),
    ("Marco Rubio", "Q349455", "us_senator"),
    ("Ted Cruz", "Q905014", "us_senator"),
    ("Rand Paul", "Q671426", "us_senator"),
    ("Lindsey Graham", "Q378702", "us_senator"),
    ("Amy Klobuchar", "Q23714", "us_senator"),
    ("Cory Booker", "Q922820", "us_senator"),
    ("Bob Menendez", "Q539566", "us_senator"),
    ("Dianne Feinstein", "Q231033", "us_senator"),
    ("Susan Collins", "Q276932", "us_senator"),
    ("Lisa Murkowski", "Q230539", "us_senator"),
    # US House Representatives (notable)
    ("Nancy Pelosi", "Q170581", "us_representative"),
    ("Kevin McCarthy", "Q977592", "us_representative"),
    ("Paul Ryan", "Q381812", "us_representative"),
    ("John Boehner", "Q310461", "us_representative"),
    ("Newt Gingrich", "Q313756", "us_representative"),
    ("Alexandria Ocasio-Cortez", "Q55223754", "us_representative"),
    ("Ilhan Omar", "Q55405334", "us_representative"),
    ("Matt Gaetz", "Q24834954", "us_representative"),
    ("Jim Jordan", "Q1689040", "us_representative"),
    ("Adam Schiff", "Q350843", "us_representative"),
    ("Jerry Nadler", "Q555961", "us_representative"),
    ("Maxine Waters", "Q234302", "us_representative"),
    ("Steny Hoyer", "Q558566", "us_representative"),
    ("Eric Cantor", "Q726155", "us_representative"),
    # US Secretaries of State / Defense
    ("John Kerry", "Q33866", "us_cabinet"),
    ("Rex Tillerson", "Q7313083", "us_cabinet"),
    ("Mike Pompeo", "Q974374", "us_cabinet"),
    ("Antony Blinken", "Q720301", "us_cabinet"),
    ("Condoleezza Rice", "Q62398", "us_cabinet"),
    ("Colin Powell", "Q34296", "us_cabinet"),
    ("Madeleine Albright", "Q41225", "us_cabinet"),
    ("Robert Gates", "Q312930", "us_cabinet"),
    ("Donald Rumsfeld", "Q184650", "us_cabinet"),  # was Q48259 (Cheney's QID)
    ("Lloyd Austin", "Q6664165", "us_cabinet"),
    # US Governors (notable)
    ("Mitt Romney", "Q1036", "us_governor"),
    ("Arnold Schwarzenegger", "Q2685", "us_governor"),
    ("Andrew Cuomo", "Q312528", "us_governor"),
    ("Gavin Newsom", "Q979879", "us_governor"),
    ("Ron DeSantis", "Q20767779", "us_governor"),
    ("Chris Christie", "Q1063986", "us_governor"),
    ("Scott Walker", "Q2608", "us_governor"),
    ("Greg Abbott", "Q1543000", "us_governor"),
    ("Brian Kemp", "Q16216063", "us_governor"),
    # UK Prime Ministers
    ("Boris Johnson", "Q180589", "uk_pm"),
    ("Theresa May", "Q264766", "uk_pm"),
    ("David Cameron", "Q192", "uk_pm"),
    ("Gordon Brown", "Q3336", "uk_pm"),
    ("Tony Blair", "Q9545", "uk_pm"),
    ("John Major", "Q9624", "uk_pm"),
    ("Margaret Thatcher", "Q7174", "uk_pm"),
    ("Rishi Sunak", "Q109404461", "uk_pm"),
    ("Liz Truss", "Q220359", "uk_pm"),
    ("Keir Starmer", "Q7671853", "uk_pm"),
    # UK Politicians
    ("Jeremy Corbyn", "Q186630", "uk_politician"),
    ("Nigel Farage", "Q192893", "uk_politician"),
    ("Sadiq Khan", "Q3444954", "uk_politician"),
    # European Leaders
    ("Angela Merkel", "Q567", "eu_leader"),
    ("Emmanuel Macron", "Q3052772", "eu_leader"),
    ("Nicolas Sarkozy", "Q1631", "eu_leader"),
    ("François Hollande", "Q191360", "eu_leader"),
    ("Olaf Scholz", "Q61221", "eu_leader"),
    ("Gerhard Schroeder", "Q2530", "eu_leader"),
    ("Silvio Berlusconi", "Q11860", "eu_leader"),
    ("Mario Draghi", "Q172572", "eu_leader"),
    ("Pedro Sánchez", "Q58808", "eu_leader"),
    ("Mariano Rajoy", "Q156560", "eu_leader"),
    # Other world leaders
    # (the Canadian prime ministers were previously tagged "eu_leader")
    ("Justin Trudeau", "Q3099714", "world_leader"),
    ("Stephen Harper", "Q154454", "world_leader"),
    ("Jean Chrétien", "Q179943", "world_leader"),
    ("Vladimir Putin", "Q7747", "world_leader"),
    ("Volodymyr Zelenskyy", "Q4328088", "world_leader"),
    ("Xi Jinping", "Q15180", "world_leader"),
    ("Narendra Modi", "Q1058583", "world_leader"),
    ("Benjamin Netanyahu", "Q39455", "world_leader"),
    ("Recep Tayyip Erdoğan", "Q1058589", "world_leader"),
    # US Judiciary
    ("John Roberts", "Q190126", "us_judiciary"),
    ("Ruth Bader Ginsburg", "Q17652", "us_judiciary"),
    ("Antonin Scalia", "Q186171", "us_judiciary"),
    ("Clarence Thomas", "Q186120", "us_judiciary"),
    ("Samuel Alito", "Q329134", "us_judiciary"),
    ("Sonia Sotomayor", "Q220813", "us_judiciary"),
    ("Elena Kagan", "Q220812", "us_judiciary"),
    ("Neil Gorsuch", "Q22248662", "us_judiciary"),
    ("Brett Kavanaugh", "Q2713403", "us_judiciary"),
    ("Amy Coney Barrett", "Q88491423", "us_judiciary"),
    ("Ketanji Brown Jackson", "Q78783", "us_judiciary"),
    ("Stephen Breyer", "Q329185", "us_judiciary"),
    ("Anthony Kennedy", "Q329136", "us_judiciary"),
    ("David Souter", "Q329188", "us_judiciary"),
    ("Sandra Day O'Connor", "Q185743", "us_judiciary"),
    # Other notable US politicians
    ("Al Sharpton", "Q348522", "us_politician"),
    ("Jesse Jackson", "Q254987", "us_politician"),
    ("John Lewis", "Q957110", "us_politician"),
    ("Elijah Cummings", "Q1329428", "us_politician"),
    ("Trey Gowdy", "Q1345735", "us_politician"),
    ("Bennie Thompson", "Q1023040", "us_politician"),
    ("Liz Cheney", "Q1631912", "us_politician"),
    ("Adam Kinzinger", "Q3608736", "us_politician"),
    ("Pete Buttigieg", "Q939894", "us_politician"),
    ("Eric Adams", "Q1328525", "us_politician"),
    ("Andrew Yang", "Q17491546", "us_politician"),
    ("Tulsi Gabbard", "Q16145726", "us_politician"),
    ("Beto O'Rourke", "Q21994992", "us_politician"),
    ("Stacey Abrams", "Q7597777", "us_politician"),
    ("Raphael Warnock", "Q7293064", "us_politician"),
    ("Jon Ossoff", "Q23039098", "us_politician"),
    ("Doug Jones", "Q17496143", "us_politician"),
    ("Roy Moore", "Q7374376", "us_politician"),
    ("Luther Strange", "Q6706827", "us_politician"),
    ("Jeff Sessions", "Q310847", "us_politician"),
    ("Eric Holder", "Q866030", "us_politician"),
    ("Loretta Lynch", "Q16960752", "us_politician"),
    ("William Barr", "Q215082", "us_politician"),
    ("Merrick Garland", "Q360413", "us_politician"),
    ("Robert Mueller", "Q367626", "us_politician"),
    ("James Comey", "Q5028025", "us_politician"),
    ("Andrew McCabe", "Q4760695", "us_politician"),
]
# section: wikidata fetch
def fetch_entity_facts(name: str, qid: str) -> list[dict]:
    """Fetch the P39 and P463 temporal facts of one entity as plain dicts.

    Args:
        name: Display name copied into every returned record ("entity").
        qid: Wikidata QID passed to the client and copied into every record
            ("entity_id").

    Returns:
        One dict per fact returned by ``WikidataClient.get_temporal_facts``.
        Each dict has "entity", "entity_id", "position" (the fact's value
        label) and "property" (the fact's property id). For every truthy
        time attribute (start / end / point) a ``<prefix>_year`` and a
        ``<prefix>_date`` (formatted ``YYYY-MM-DD``) key is added.
    """
    # Project-local import kept at function scope, as in the original layout.
    from backend.pipeline.verification.wikidata import WikidataClient

    def _to_record(fact) -> dict:
        record = {
            "entity": name,
            "entity_id": qid,
            "position": fact.value_label,
            "property": fact.property_id,
        }
        # Same key order as writing the three cases out by hand:
        # start_*, then end_*, then point_*.
        for prefix in ("start", "end", "point"):
            moment = getattr(fact, f"time_{prefix}")
            if moment:
                record[f"{prefix}_year"] = moment.year
                record[f"{prefix}_date"] = moment.strftime("%Y-%m-%d")
        return record

    wikidata = WikidataClient()
    temporal_facts = wikidata.get_temporal_facts(
        qid, relation_properties=["P39", "P463"]
    )
    return [_to_record(fact) for fact in temporal_facts]
# section: main
def main(args: argparse.Namespace) -> None:
    """Fetch facts for the configured entities and write the Reference KG.

    Args:
        args: Parsed CLI options. Three attributes are read:
            ``dry_run`` -- only list the entities; nothing is fetched or
            written.
            ``merge`` -- load the existing output file first and keep every
            entity from it that was not successfully fetched in this run.
            ``limit`` -- process only the first ``limit`` entries of
            ``ENTITIES``; a falsy value (``None`` or 0) means all of them.

    Side effects:
        Unless ``dry_run`` is set, overwrites ``OUTPUT_FILE`` with the built
        KG and ``LOG_FILE`` with a per-entity build log, and prints a summary.
        A failure while fetching one entity is logged and recorded in the
        "failed" list; it does not abort the run.
    """
    dry_run = args.dry_run
    merge = args.merge
    limit = args.limit
    entities = ENTITIES[:limit] if limit else ENTITIES
    total = len(entities)

    def fact_count(entity: dict) -> int:
        # Entries merged from an older KG file may have no "facts" key (or a
        # null one); count those as zero instead of raising KeyError after
        # all the fetching work is already done.
        return len(entity.get("facts") or [])

    print("\n" + "=" * 65)
    print(" BUILD EXTENDED REFERENCE KG")
    print(f" Entities : {total}")
    print(f" Dry run : {dry_run}")
    print(f" Merge : {merge}")
    print(f" Output : {OUTPUT_FILE}")
    print("=" * 65 + "\n")

    # load existing KG if merging, indexed by entity name
    existing: dict[str, dict] = {}
    if merge and OUTPUT_FILE.exists():
        with open(OUTPUT_FILE, "r", encoding="utf-8") as fh:
            data = json.load(fh)
        for entry in data.get("entities", []):
            existing[entry["name"]] = entry
        logger.info(f"Loaded {len(existing)} existing entities for merge.")

    # fetch facts for each entity
    all_entities = []
    failed = []
    log_entries = []
    for i, (name, qid, category) in enumerate(entities, 1):
        logger.info(f"[{i:3}/{total}] {name} ({qid}) [{category}]")
        if dry_run:
            print(f" DRY RUN: would fetch {name} ({qid})")
            continue
        try:
            facts = fetch_entity_facts(name, qid)
        except Exception as e:
            # Per-entity boundary: record the failure and carry on with the
            # remaining entities.
            logger.error(f" -> FAILED: {e}")
            failed.append({"name": name, "qid": qid, "error": str(e)})
            log_entries.append({
                "name": name, "qid": qid,
                "facts_count": 0, "status": "failed", "error": str(e),
            })
        else:
            all_entities.append({
                "name": name,
                "qid": qid,
                "category": category,
                "facts": facts,
                "fetched_at": datetime.now().isoformat(),
            })
            log_entries.append({
                "name": name, "qid": qid,
                "facts_count": len(facts), "status": "ok",
            })
            logger.info(f" -> {len(facts)} facts fetched")
            if not facts:
                logger.warning(f" -> no facts found for {name}")
        # Rate-limit pause between queries; pointless after the last one.
        if i < total:
            time.sleep(QUERY_DELAY)

    if dry_run:
        print(f"\nDry run complete. Would have fetched {total} entities.")
        return

    # merge with existing if requested: keep every previously stored entity
    # that this run did not (successfully) fetch
    if merge and existing:
        fetched_names = {e["name"] for e in all_entities}
        for name, entry in existing.items():
            if name not in fetched_names:
                all_entities.append(entry)
                logger.info(f"Kept existing: {name}")

    # build output structure
    output = {
        "generated_at": datetime.now().isoformat(),
        "total_entities": len(all_entities),
        "total_facts": sum(fact_count(e) for e in all_entities),
        "failed_count": len(failed),
        "entities": all_entities,
        "failed": failed,
    }

    # save output
    with open(OUTPUT_FILE, "w", encoding="utf-8") as fh:
        json.dump(output, fh, ensure_ascii=False, indent=2)

    # save build log
    with open(LOG_FILE, "w", encoding="utf-8") as fh:
        json.dump({
            "generated_at": datetime.now().isoformat(),
            "entries": log_entries,
            "failed": failed,
        }, fh, ensure_ascii=False, indent=2)

    # summary
    print("\n" + "=" * 65)
    print(" SUMMARY")
    print("=" * 65)
    print(f" Entities fetched : {len(all_entities)}")
    print(f" Total facts : {output['total_facts']}")
    print(f" Failed : {len(failed)}")
    print(f" Output : {OUTPUT_FILE}")
    print(f" Log : {LOG_FILE}")
    if failed:
        print("\n Failed entities:")
        for item in failed:
            print(f" - {item['name']} ({item['qid']}): {item['error']}")
    print("\n Top entities by fact count:")
    for e in sorted(all_entities, key=fact_count, reverse=True)[:10]:
        print(f" {e['name']:<35} {fact_count(e):>3} facts")
# Script entry point: declare the three CLI options and hand the parsed
# namespace to main().
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Build Extended Reference KG from Wikidata",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show entities without fetching data",
    )
    cli.add_argument(
        "--merge",
        action="store_true",
        help="Merge with existing Reference KG (keep existing entities)",
    )
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit number of entities (e.g. --limit 20 for quick test)",
    )
    main(cli.parse_args())