# FakeNews-XAI / scripts/update_reference_kg.py
# Marius16 — RSS verification panel, benchmark split (100/150), remove Ollama/REBEL from frontend, sync Qwen3 labels
# 8728df8 · 10.3 kB
"""Add new entities to verified_events.json without deleting the existing ones.

Usage:
    python scripts/update_reference_kg.py
    python scripts/update_reference_kg.py --refetch-empty
"""
from __future__ import annotations
import json
import logging
import sys
import time
from datetime import datetime
from pathlib import Path
# Repository root = the directory two levels above this file.
# ``__file__`` can be a relative path (e.g. ``python update_reference_kg.py``
# started from inside scripts/ on Python < 3.9); without ``resolve()`` the
# double ``.parent`` then collapses to "." and points at the wrong directory.
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Put the root on sys.path so the ``backend`` package imported further down
# can be found when this file is run directly as a script.
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))
# Configure logging at import time: INFO threshold (so logger.debug() calls in
# this file are not emitted) and one-line records prefixed with HH:MM:SS.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
# Logger shared by all functions in this script.
logger = logging.getLogger("update_reference_kg")
# JSON knowledge-graph file that main() loads, extends and writes back.
OUTPUT_FILE = _PROJECT_ROOT / "data" / "reference_kg" / "verified_events.json"
# Pause, in seconds, passed to time.sleep() between consecutive Wikidata calls.
QUERY_DELAY = 1.5
# Seed list of entities that main() adds to the reference KG when no entity
# with the same (case-insensitive) name exists yet. Each record holds the
# display name, the Wikidata QID to query first, and a category tag.
# NOTE(review): several QIDs below do not look like the commonly cited Wikidata
# ids for these people (e.g. Q9682 is, as far as I know, Elizabeth II rather
# than Tony Blair) — TODO verify each one on wikidata.org. fetch_facts() only
# falls back to a name search when a QID returns zero facts, so a wrong QID
# that does return facts would be stored under the wrong name.
NEW_ENTITIES = [
    {"name": person, "qid": wikidata_id, "category": group}
    for person, wikidata_id, group in (
        # UK politicians
        ("Keir Starmer", "Q23497151", "uk_politician"),
        ("Tony Blair", "Q9682", "uk_politician"),
        ("Rishi Sunak", "Q63455796", "uk_politician"),
        ("Boris Johnson", "Q327407", "uk_politician"),
        ("Theresa May", "Q264766", "uk_politician"),
        ("Gordon Brown", "Q3936", "uk_politician"),
        ("Wes Streeting", "Q7983907", "uk_politician"),
        ("Andy Burnham", "Q4759326", "uk_politician"),
        ("Nicola Sturgeon", "Q329938", "uk_politician"),
        ("Margaret Thatcher", "Q7837", "uk_politician"),
        # EU politicians
        ("Donald Tusk", "Q946", "eu_politician"),
        ("Emmanuel Macron", "Q3052772", "eu_politician"),
        ("Olaf Scholz", "Q61053", "eu_politician"),
        ("Ursula von der Leyen", "Q21023677", "eu_politician"),
        # US politicians
        ("Ken Paxton", "Q7383626", "us_politician"),
        # US presidents — historical
        ("Richard Nixon", "Q9588", "us_president"),
        ("Gerald Ford", "Q9582", "us_president"),
        ("Lyndon B. Johnson", "Q9597", "us_president"),
        ("Dwight Eisenhower", "Q9586", "us_president"),
        ("John F. Kennedy", "Q9574", "us_president"),
        ("Harry Truman", "Q9583", "us_president"),
    )
]
def _wikidata_facts_to_entries(name: str, qid: str, wikidata_facts: list) -> list[dict]:
    """Convert WikidataFact objects into reference-KG entries, dropping bad dates.

    Every input object is read through the attributes ``value_label``,
    ``property_id``, ``time_start``, ``time_end`` and ``time_point``; the three
    time attributes may be falsy (missing) or objects exposing ``.year`` and
    ``.strftime``.

    Facts whose ``time_start`` year is before 1900 are skipped as erroneous
    Wikidata dates. Facts without a ``time_start`` are always kept.

    Args:
        name: Entity name stored under ``"entity"`` in each entry.
        qid: Wikidata id stored under ``"entity_id"`` in each entry.
        wikidata_facts: Facts to convert.

    Returns:
        One dict per kept fact with the keys ``entity``, ``entity_id``,
        ``position`` and ``property``, plus ``<kind>_year`` / ``<kind>_date``
        (``YYYY-MM-DD``) for each of ``start``, ``end`` and ``point`` that is set.
    """
    entries: list[dict] = []
    for fact in wikidata_facts:
        started = fact.time_start
        if started and started.year < 1900:
            # Lazy %-args: the message is only formatted when DEBUG is enabled.
            logger.debug(
                " skip '%s' — start_year=%s < 1900", fact.value_label, started.year
            )
            continue

        entry: dict = {
            "entity": name,
            "entity_id": qid,
            "position": fact.value_label,
            "property": fact.property_id,
        }
        # Same year/date pair for each temporal qualifier that is present;
        # the tuple order fixes the key order in the resulting dict.
        for kind, moment in (
            ("start", started),
            ("end", fact.time_end),
            ("point", fact.time_point),
        ):
            if moment:
                entry[f"{kind}_year"] = moment.year
                entry[f"{kind}_date"] = moment.strftime("%Y-%m-%d")
        entries.append(entry)
    return entries
def fetch_facts(name: str, qid: str) -> tuple[list[dict], str]:
    """Fetch P39 + P463 temporal facts for an entity from Wikidata.

    When *qid* yields no facts, the entity is looked up by *name* through
    ``search_entity_full``. Only the top search hit is considered: if its id
    differs from *qid*, the fetch is retried once with that id; otherwise the
    result stays empty.

    Args:
        name: Entity name, used for the fallback search and stored in entries.
        qid: Wikidata id to query first.

    Returns:
        ``(facts, used_qid)``. *facts* are the entries built by
        ``_wikidata_facts_to_entries``. *used_qid* is the search-hit id when
        the retry produced facts, otherwise the *qid* that was passed in.

    Raises:
        Whatever the Wikidata client raises, and ``KeyError`` if the top search
        hit has no ``"id"`` field; callers in this file catch ``Exception``.
    """
    # Local import: the backend package is only needed once a fetch runs.
    from backend.pipeline.verification.wikidata import WikidataClient

    client = WikidataClient()
    raw_facts = client.get_temporal_facts(qid, relation_properties=["P39", "P463"])

    if not raw_facts:
        logger.warning(f" 0 fapte pentru {name} ({qid}) — încearcă search_entity_full")
        time.sleep(QUERY_DELAY)
        search_results = client.search_entity_full(name)

        if not search_results:
            logger.warning(f" search_entity_full nu a găsit rezultate pentru '{name}'")
        else:
            candidate = search_results[0]
            candidate_qid = candidate["id"]
            if candidate_qid == qid:
                logger.warning(
                    f" search_entity_full a returnat același QID {qid} — 0 fapte finale"
                )
            else:
                # Label and description were previously concatenated with no
                # separator; join the non-empty parts so the log is readable.
                summary = " — ".join(
                    part
                    for part in (
                        candidate.get("label", ""),
                        candidate.get("description", ""),
                    )
                    if part
                )
                logger.info(f" QID corectat: {qid} -> {candidate_qid} ({summary})")
                time.sleep(QUERY_DELAY)
                raw_facts = client.get_temporal_facts(
                    candidate_qid, relation_properties=["P39", "P463"]
                )
                if raw_facts:
                    # Adopt the corrected id only when it actually has data.
                    qid = candidate_qid
                    logger.info(f" -> {len(raw_facts)} fapte cu QID corectat")
                else:
                    logger.warning(
                        f" QID corectat {candidate_qid} returnează tot 0 fapte"
                    )

    # ``or []`` keeps a falsy non-list result (e.g. None) from breaking the
    # conversion loop; it is treated the same as "no facts".
    facts = _wikidata_facts_to_entries(name, qid, raw_facts or [])
    return facts, qid
def _save_kg(data: dict, entities: list[dict]) -> int:
    """Refresh the summary fields of *data*, write it to OUTPUT_FILE, return the fact total."""
    total_facts = sum(len(e.get("facts", [])) for e in entities)
    data["entities"] = entities
    data["total_entities"] = len(entities)
    data["total_facts"] = total_facts
    data["generated_at"] = datetime.now().isoformat()
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    return total_facts


def _refetch_empty_entities(data: dict, entities: list[dict]) -> None:
    """Re-query every entity that has no facts, update it in place and save the KG."""
    empty = [e for e in entities if not e.get("facts")]
    print(f"Entitati cu 0 fapte: {len(empty)}\n")

    updated_count = 0
    updated_facts = 0
    for entity in empty:
        name = entity["name"]
        qid = entity["qid"]
        logger.info(f" Refetch: {name} ({qid})")
        try:
            facts, used_qid = fetch_facts(name, qid)
        except Exception as exc:
            # Best effort per entity: log and move on to the next one.
            logger.warning(f" EROARE la {name}: {exc}")
        else:
            if facts:
                entity["facts"] = facts
                entity["qid"] = used_qid
                entity["fetched_at"] = datetime.now().isoformat()
                updated_count += 1
                updated_facts += len(facts)
                logger.info(f" UPDATED: {name} -> {len(facts)} fapte noi")
            else:
                logger.info(f" NO DATA: {name} — 0 fapte si dupa retry")
        # Pause after every attempt, successful or not.
        time.sleep(QUERY_DELAY)

    total_facts = _save_kg(data, entities)

    print("\nRezultat --refetch-empty:")
    print(f" Entitati actualizate : {updated_count} / {len(empty)}")
    print(f" Fapte noi total : {updated_facts}")
    print(f" Total fapte in KG : {total_facts}")
    print(f" Salvat in : {OUTPUT_FILE}\n")


def _add_new_entities(data: dict, entities: list[dict]) -> None:
    """Fetch and append every NEW_ENTITIES record not yet in the KG, then save it."""
    # Case-insensitive name index of what the KG already contains.
    existing_names = {e["name"].lower() for e in entities}
    print(f"Entitati noi de procesat: {len(NEW_ENTITIES)}\n")

    added_entities = 0
    added_facts = 0
    skipped = 0
    for entity in NEW_ENTITIES:
        name = entity["name"]
        qid = entity["qid"]
        category = entity["category"]

        if name.lower() in existing_names:
            # No request was made, so no pause is needed here.
            logger.info(f" SKIP (exista deja): {name}")
            skipped += 1
            continue

        logger.info(f" Fetch: {name} ({qid}) [{category}]")
        try:
            facts, used_qid = fetch_facts(name, qid)
        except Exception as exc:
            logger.warning(f" EROARE la {name}: {exc}")
            # Keep the pause after a failed request too, as the refetch loop
            # does, so an error does not trigger an immediate next query.
            time.sleep(QUERY_DELAY)
            continue

        if used_qid != qid:
            logger.info(f" QID salvat in KG: {used_qid} (corectat de la {qid})")

        # The entity is recorded even with zero facts; --refetch-empty can
        # retry such entries later.
        entities.append({
            "name": name,
            "qid": used_qid,
            "category": category,
            "facts": facts,
            "fetched_at": datetime.now().isoformat(),
        })
        existing_names.add(name.lower())
        added_entities += 1
        added_facts += len(facts)
        logger.info(f" -> {len(facts)} fapte adaugate pentru {name}")
        time.sleep(QUERY_DELAY)

    total_facts = _save_kg(data, entities)

    print("\nRezultat:")
    print(f" Entitati noi adaugate : {added_entities}")
    print(f" Fapte noi adaugate : {added_facts}")
    print(f" Sarite (deja exista) : {skipped}")
    print(f" Total entitati in KG : {len(entities)}")
    print(f" Total fapte in KG : {total_facts}")
    print(f" Salvat in : {OUTPUT_FILE}\n")


def main() -> None:
    """Command-line entry point: extend or repair the reference KG file.

    Without flags, every NEW_ENTITIES record whose name is not yet in the KG is
    fetched from Wikidata and appended. With ``--refetch-empty``, entities that
    are already in the KG but have no facts are fetched again instead. In both
    modes the summary fields are refreshed and OUTPUT_FILE is rewritten.

    Exits with status 1 when OUTPUT_FILE does not exist.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--refetch-empty",
        action="store_true",
        help="Re-fetch fapte pentru entitati cu 0 fapte in KG",
    )
    args = parser.parse_args()

    # This script only updates an existing KG; it never creates one.
    if not OUTPUT_FILE.exists():
        logger.error(f"Fisierul {OUTPUT_FILE} nu exista. Ruleaza build_reference_kg.py intai.")
        sys.exit(1)

    with open(OUTPUT_FILE, encoding="utf-8") as f:
        data = json.load(f)
    entities: list[dict] = data.get("entities", [])

    print(f"\nKG existent: {len(entities)} entitati, {data.get('total_facts', '?')} fapte")

    if args.refetch_empty:
        _refetch_empty_entities(data, entities)
    else:
        _add_new_entities(data, entities)
# Run the updater only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()