""" Convert a raw triples JSON cache (e.g. alex_rivera_raw.json) to a NetworkX MultiDiGraph and save it as a .gpickle file. Usage: python scripts/convert_raw_triples.py --persona alex_rivera No LLM or langchain dependencies — only needs networkx. """ import argparse import json import pickle from pathlib import Path import networkx as nx def build_graph(triples: list[dict]) -> nx.MultiDiGraph: """ Build a MultiDiGraph from a list of {subject, relation, object, confidence} dicts. Deduplicates exact (s, r, o) triplets, keeping the highest-confidence instance. """ best: dict[tuple, dict] = {} for t in triples: key = (t["subject"], t["relation"], t["object"]) if t["confidence"] > best.get(key, {}).get("confidence", -1.0): best[key] = t graph = nx.MultiDiGraph() for (s, r, o), t in best.items(): graph.add_node(s) graph.add_node(o) graph.add_edge(s, o, relation=r, confidence=t["confidence"]) return graph def main() -> None: parser = argparse.ArgumentParser() parser.add_argument("--persona", default="alex_rivera") parser.add_argument("--kg-dir", default="data/knowledge_graphs") args = parser.parse_args() kg_dir = Path(args.kg_dir) raw_path = kg_dir / f"{args.persona}_raw.json" out_path = kg_dir / f"{args.persona}.gpickle" if not raw_path.exists(): print(f"ERROR: {raw_path} not found. Run build_kg.py first to generate it.") return print(f"Loading {raw_path} ...") with raw_path.open(encoding="utf-8") as fh: triples = json.load(fh) print(f" {len(triples)} raw triples loaded") graph = build_graph(triples) print(f" Nodes : {graph.number_of_nodes()}") print(f" Edges : {graph.number_of_edges()}") # Show a sample of edges print("\nSample edges:") for i, (s, o, data) in enumerate(graph.edges(data=True)): print(f" {s} --[{data['relation']}]--> {o} (conf={data['confidence']:.2f})") if i >= 9: print(f" ... ({graph.number_of_edges() - 10} more)") break with out_path.open("wb") as fh: pickle.dump(graph, fh) print(f"\nSaved to {out_path}") if __name__ == "__main__": main()