File size: 2,256 Bytes
1004967
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
"""
Convert a raw triples JSON cache (e.g. alex_rivera_raw.json) to a
NetworkX MultiDiGraph and save it as a .gpickle file.

Usage:
    python scripts/convert_raw_triples.py --persona alex_rivera

No LLM or langchain dependencies — only needs networkx.
"""

import argparse
import json
import pickle
from pathlib import Path

import networkx as nx


def build_graph(triples: list[dict]) -> nx.MultiDiGraph:
    """
    Build a MultiDiGraph from a list of triple dicts.

    Each triple must provide the keys ``subject``, ``relation``, ``object``
    and ``confidence``. Exact duplicate (subject, relation, object) triples
    are collapsed into a single edge that carries the highest confidence
    seen; on a tie the first occurrence wins.

    Args:
        triples: Triple dicts as described above.

    Returns:
        A graph with one edge per unique triple. Each edge stores the
        ``relation`` and ``confidence`` of the winning triple as attributes.

    Raises:
        KeyError: If a triple lacks one of the required keys.
    """
    best: dict[tuple, dict] = {}
    for t in triples:
        key = (t["subject"], t["relation"], t["object"])
        confidence = t["confidence"]
        current = best.get(key)
        # Compare against the stored triple instead of a -1.0 sentinel so a
        # triple is never silently dropped because its confidence is <= -1.0.
        if current is None or confidence > current["confidence"]:
            best[key] = t

    graph = nx.MultiDiGraph()
    for (s, r, o), t in best.items():
        # add_edge creates any missing endpoint nodes itself, in (s, o) order.
        graph.add_edge(s, o, relation=r, confidence=t["confidence"])

    return graph


def main() -> None:
    """
    Command-line entry point: convert ``<persona>_raw.json`` to a pickled graph.

    Reads ``--persona`` and ``--kg-dir`` from the command line, loads the raw
    triples JSON from the knowledge-graph directory, builds the graph with
    ``build_graph``, prints a node/edge summary plus up to ten sample edges,
    and pickles the graph to ``<persona>.gpickle`` in the same directory.

    If the raw JSON file does not exist, an error message is printed and the
    function returns without writing anything.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--persona", default="alex_rivera")
    parser.add_argument("--kg-dir", default="data/knowledge_graphs")
    args = parser.parse_args()

    kg_dir = Path(args.kg_dir)
    raw_path = kg_dir / f"{args.persona}_raw.json"
    out_path = kg_dir / f"{args.persona}.gpickle"

    if not raw_path.exists():
        print(f"ERROR: {raw_path} not found. Run build_kg.py first to generate it.")
        return

    print(f"Loading {raw_path} ...")
    with raw_path.open(encoding="utf-8") as fh:
        triples = json.load(fh)
    print(f"  {len(triples)} raw triples loaded")

    graph = build_graph(triples)
    total_edges = graph.number_of_edges()
    print(f"  Nodes : {graph.number_of_nodes()}")
    print(f"  Edges : {total_edges}")

    # Show a sample of edges
    sample_size = 10
    print("\nSample edges:")
    for i, (s, o, data) in enumerate(graph.edges(data=True)):
        if i >= sample_size:
            # Only reached when an edge beyond the sample exists, so the
            # notice is never printed as "... (0 more)".
            print(f"  ... ({total_edges - sample_size} more)")
            break
        print(f"  {s} --[{data['relation']}]--> {o}  (conf={data['confidence']:.2f})")

    with out_path.open("wb") as fh:
        pickle.dump(graph, fh)
    print(f"\nSaved to {out_path}")


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()