# memorybridge/scripts/convert_raw_triples.py
# kimandrew927's picture
# Initial Space deployment
# 1004967
"""
Convert a raw triples JSON cache (e.g. alex_rivera_raw.json) to a
NetworkX MultiDiGraph and save it as a .gpickle file.
Usage:
python scripts/convert_raw_triples.py --persona alex_rivera
No LLM or langchain dependencies — only needs networkx.
"""
import argparse
import json
import pickle
from pathlib import Path
import networkx as nx
def build_graph(triples: list[dict]) -> nx.MultiDiGraph:
    """
    Build a MultiDiGraph from a list of {subject, relation, object, confidence} dicts.

    Deduplicates exact (s, r, o) triplets, keeping the highest-confidence instance.
    When duplicates tie on confidence, the first one encountered is kept.

    Args:
        triples: Dicts that each provide "subject", "relation", "object" and
            "confidence" keys. Subjects, relations and objects are used in a
            dict key (and subjects/objects as node ids), so they must be hashable.

    Returns:
        A graph with one edge per unique (subject, relation, object) triple;
        each edge carries "relation" and "confidence" attributes.

    Raises:
        KeyError: If a triple lacks one of the required keys.
    """
    best: dict[tuple, dict] = {}
    for t in triples:
        key = (t["subject"], t["relation"], t["object"])
        current = best.get(key)
        # Compare against the stored triple itself rather than a numeric
        # sentinel, so a triple is never dropped merely because its
        # confidence happens to be <= -1.0.
        if current is None or t["confidence"] > current["confidence"]:
            best[key] = t

    graph = nx.MultiDiGraph()
    for (s, r, o), t in best.items():
        # add_edge creates any missing endpoint nodes (s first, then o),
        # so no separate add_node calls are needed.
        graph.add_edge(s, o, relation=r, confidence=t["confidence"])
    return graph
def main() -> None:
    """
    Convert one persona's raw triples JSON cache into a pickled graph.

    Reads ``<kg-dir>/<persona>_raw.json``, builds the graph with
    ``build_graph``, prints the node/edge counts plus up to ten sample edges,
    and pickles the graph to ``<kg-dir>/<persona>.gpickle``.

    If the input file does not exist, an error message is printed and the
    function returns without writing anything.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--persona", default="alex_rivera")
    parser.add_argument("--kg-dir", default="data/knowledge_graphs")
    args = parser.parse_args()

    kg_dir = Path(args.kg_dir)
    raw_path = kg_dir / f"{args.persona}_raw.json"
    out_path = kg_dir / f"{args.persona}.gpickle"

    if not raw_path.exists():
        print(f"ERROR: {raw_path} not found. Run build_kg.py first to generate it.")
        return

    print(f"Loading {raw_path} ...")
    with raw_path.open(encoding="utf-8") as fh:
        triples = json.load(fh)
    print(f" {len(triples)} raw triples loaded")

    graph = build_graph(triples)
    edge_count = graph.number_of_edges()
    print(f" Nodes : {graph.number_of_nodes()}")
    print(f" Edges : {edge_count}")

    # Show a sample of edges
    sample_limit = 10
    print("\nSample edges:")
    for i, (s, o, data) in enumerate(graph.edges(data=True)):
        if i >= sample_limit:
            # Only reached when an edge beyond the sample exists, so the
            # remainder is always >= 1 (never a misleading "0 more").
            print(f" ... ({edge_count - sample_limit} more)")
            break
        print(f" {s} --[{data['relation']}]--> {o} (conf={data['confidence']:.2f})")

    # The graph is serialized with plain pickle; only load the resulting
    # file from sources you trust.
    with out_path.open("wb") as fh:
        pickle.dump(graph, fh)
    print(f"\nSaved to {out_path}")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()