ragtiquicIA / scripts /export_concept_graph.py
Santiago Casas
helper scripts
9c6fc7f
#!/usr/bin/env python3
"""Export a concept co-occurrence graph from SurrealDB.
Outputs (default under ./exports):
- concept_graph.json: Sigma-friendly graph JSON
- concept_graph.html: self-contained HTML viewer (vis-network inlined)
Edges are co-occurrence within the same chunk:
If a chunk mentions concepts A and B, edge(A,B) += 1.
"""
from __future__ import annotations
import argparse
import itertools
import json
import math
import os
import random
import urllib.request
from pathlib import Path
from typing import Any
from surrealdb import Surreal
def _query_rows(
    conn: Surreal, surql: str, vars: dict[str, Any] | None = None
) -> list[dict[str, Any]]:
    """Run a SurrealQL statement and return its dict-shaped rows.

    Args:
        conn: Connection object exposing ``query(surql, vars)``.
        surql: The SurrealQL text to execute.
        vars: Optional bind variables; an empty dict is sent when omitted
            (or when an empty mapping is given).

    Returns:
        Every element of the query result that is a ``dict``, in order.
        A result that is not a list yields an empty list.
    """
    bindings = vars if vars else {}
    result = conn.query(surql, bindings)

    rows: list[dict[str, Any]] = []
    if isinstance(result, list):
        for item in result:
            # Anything that is not a dict is not a row and is dropped.
            if isinstance(item, dict):
                rows.append(item)
    return rows
def _as_str_id(value: Any) -> str:
    """Return the ``str()`` form of an identifier value.

    Used so that ids coming back from the database can serve as dict keys,
    set members and JSON values regardless of their original type.
    """
    text = str(value)
    return text
def _linear_scale(
    value: float, vmin: float, vmax: float, out_min: float, out_max: float
) -> float:
    """Map ``value`` from ``[vmin, vmax]`` onto ``[out_min, out_max]`` linearly.

    Values outside the input range are clamped to the nearest output bound.
    When the input range is empty or inverted (``vmax <= vmin``) the result
    is ``out_min``.
    """
    if vmin >= vmax:
        return out_min

    in_span = vmax - vmin
    out_span = out_max - out_min
    fraction = (value - vmin) / in_span

    # Clamp to [0, 1]: cap at the top first, then floor at the bottom.
    capped = fraction if fraction < 1.0 else 1.0
    clamped = capped if capped > 0.0 else 0.0

    return out_min + clamped * out_span
def _fetch_text(url: str, timeout_s: int = 20) -> str:
    """Open ``url`` and return the response body as text.

    Args:
        url: Location to open with ``urllib.request.urlopen``.
        timeout_s: Timeout, in seconds, handed to ``urlopen``.

    Returns:
        The body decoded as UTF-8; undecodable bytes are replaced instead
        of raising.
    """
    with urllib.request.urlopen(url, timeout=timeout_s) as response:
        raw_body = response.read()
    return str(raw_body, encoding="utf-8", errors="replace")
def _write_html_viewer(output_html: Path, payload: dict[str, Any], title: str) -> None:
    """Write a self-contained HTML viewer using vis-network.

    The vis-network 9.1.9 JS/CSS are downloaded once at export time and
    inlined, so the generated page does not rely on runtime CDN access.

    Args:
        output_html: Destination file; written as UTF-8.
        payload: Graph data with ``nodes`` and ``edges`` lists, embedded in
            the page as a JavaScript object literal.
        title: Page title, shown in ``<title>`` and the header. It is
            HTML-escaped before being written.

    If downloading either asset fails for any reason, a minimal fallback
    page containing the (escaped) error message is written instead and the
    function returns normally.
    """
    # Local import: keeps this block self-contained without touching the
    # file-level import list.
    from html import escape

    safe_title = escape(title)

    css_url = (
        "https://cdn.jsdelivr.net/npm/vis-network@9.1.9/styles/vis-network.min.css"
    )
    js_url = "https://cdn.jsdelivr.net/npm/vis-network@9.1.9/standalone/umd/vis-network.min.js"
    try:
        vis_css = _fetch_text(css_url)
        vis_js = _fetch_text(js_url)
    except Exception as exc:
        # Deliberate best-effort: any failure while fetching assets yields a
        # readable placeholder page rather than aborting the export.
        output_html.write_text(
            "\n".join(
                [
                    "<!doctype html>",
                    '<meta charset="utf-8" />',
                    f"<title>{safe_title}</title>",
                    "<pre>",
                    "Failed to download vis-network assets while generating HTML.",
                    f"Error: {escape(str(exc))}",
                    "</pre>",
                ]
            ),
            encoding="utf-8",
        )
        return

    # The JSON is placed inside an inline <script>. Labels come from the
    # database, so "<", ">" and "&" are emitted as \uXXXX escapes: a label
    # containing "</script>" or "<!--" can then neither terminate the script
    # element nor inject markup. In JSON these characters only occur inside
    # strings, where the escapes decode back to the same text.
    data_json = (
        json.dumps(payload, ensure_ascii=False)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )

    # f-string + braces => double braces for literal '{' in JS/CSS.
    document = f"""<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>{safe_title}</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style>
html, body {{ height: 100%; margin: 0; overflow: hidden; font-family: system-ui, -apple-system, Segoe UI, sans-serif; background:#0b1220; color:#e5e7eb; }}
#app {{ position: fixed; inset: 0; display: flex; flex-direction: column; }}
header {{ flex: 0 0 auto; padding: 10px 12px; border-bottom: 1px solid rgba(148,163,184,.25); display:flex; gap:12px; align-items:baseline; }}
header h1 {{ font-size: 14px; margin: 0; font-weight: 650; letter-spacing: .2px; }}
header .meta {{ color: rgba(226,232,240,.75); font-size: 12px; }}
#network {{ flex: 1 1 auto; min-height: 0; width: 100%; }}
#error {{ position: fixed; left: 0; right: 0; bottom: 0; max-height: 40%; overflow: auto; padding: 10px 12px; white-space: pre-wrap; color: #fecaca; background: rgba(15, 23, 42, .85); border-top: 1px solid rgba(148,163,184,.25); }}
</style>
<style>
{vis_css}
</style>
</head>
<body>
<div id="app">
<header>
<h1>{safe_title}</h1>
<div class="meta">drag nodes, scroll to zoom, click node to focus</div>
</header>
<div id="network"></div>
</div>
<div id="error"></div>
<script>
{vis_js}
</script>
<script>
const payload = {data_json};
const elErr = document.getElementById('error');
try {{
const nodes = payload.nodes.map(n => ({{
id: n.id,
label: n.label,
value: Math.max(1, n.size || 1),
title: (n.label + "\\nfrequency: " + (n.frequency ?? '')),
color: {{ background: '#2563eb', border: '#93c5fd' }},
font: {{ color: '#e5e7eb' }},
}}));
const edges = payload.edges.map(e => ({{
id: e.id,
from: e.source,
to: e.target,
value: e.weight || 1,
title: ("weight: " + (e.weight || 1)),
color: {{ color: 'rgba(148,163,184,.55)' }},
smooth: {{ type: 'dynamic' }},
}}));
const container = document.getElementById('network');
const data = {{ nodes: new vis.DataSet(nodes), edges: new vis.DataSet(edges) }};
const options = {{
nodes: {{ shape: 'dot', scaling: {{ min: 6, max: 28 }} }},
edges: {{ scaling: {{ min: 0.2, max: 4 }} }},
interaction: {{ hover: true, tooltipDelay: 120 }},
physics: {{ stabilization: {{ iterations: 200, fit: true }}, barnesHut: {{ gravitationalConstant: -12000, springLength: 110 }} }},
}};
const network = new vis.Network(container, data, options);
network.once('stabilizationIterationsDone', () => {{
try {{ network.fit({{ animation: false }}); }} catch (e) {{}}
}});
network.on('click', params => {{
if (params.nodes.length) network.focus(params.nodes[0], {{ scale: 1.2, animation: {{ duration: 250 }} }});
}});
}} catch (e) {{
elErr.textContent = String(e && e.stack ? e.stack : e);
}}
</script>
</body>
</html>
"""
    output_html.write_text(document, encoding="utf-8")
def _top_concepts(conn: Surreal, top: int) -> list[dict[str, Any]]:
    """Return up to ``top`` concepts ranked by how often they are mentioned.

    The frequency is derived by counting MENTIONS_CONCEPT edges per target
    (``out``) record, because the concept table doesn't reliably carry a
    frequency field in this dataset. Each row exposes ``out``, ``id``,
    ``name`` and ``frequency``.
    """
    surql = """
SELECT out, out.id AS id, out.content AS name, count() AS frequency
FROM MENTIONS_CONCEPT
GROUP BY out
ORDER BY frequency DESC
LIMIT $top
"""
    bindings = {"top": top}
    return _query_rows(conn, surql, bindings)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the CLI parser; database settings default to environment variables."""
    parser = argparse.ArgumentParser(description="Export concept graph from SurrealDB")
    parser.add_argument("--top", type=int, default=50, help="Top concepts to include")
    parser.add_argument("--min-weight", type=int, default=1, help="Minimum edge weight")
    parser.add_argument(
        "--max-edges", type=int, default=500, help="Maximum edges output"
    )
    parser.add_argument(
        "--max-concepts-per-chunk",
        type=int,
        default=25,
        help="Cap concepts per chunk for co-occurrence",
    )
    parser.add_argument("--output-dir", default="exports", help="Output directory")
    parser.add_argument(
        "--db-url",
        default=os.getenv("KG_DB_URL", "ws://localhost:8000/rpc"),
        help="SurrealDB WS URL",
    )
    parser.add_argument(
        "--db-name",
        default=os.getenv("DB_NAME", "test_db"),
        help="SurrealDB database name",
    )
    parser.add_argument(
        "--namespace",
        default=os.getenv("DB_NS", "kaig"),
        help="SurrealDB namespace",
    )
    parser.add_argument("--username", default=os.getenv("DB_USER", "root"))
    parser.add_argument("--password", default=os.getenv("DB_PASS", "root"))
    parser.add_argument("--seed", type=int, default=7, help="Layout seed")
    return parser


def _nodes_from_rows(
    rows: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], list[Any]]:
    """Convert top-concept rows into node dicts plus their raw record ids.

    Rows without an ``id`` are skipped. Nodes are placed evenly on the unit
    circle in row order and sized between 5 and 20 by frequency.
    """
    kept: list[tuple[Any, str, str, float]] = []
    for row in rows:
        record_id = row.get("id")
        if record_id is None:
            continue
        node_id = _as_str_id(record_id)
        # Fall back to the part of the id after the first ":" when the
        # concept has no usable name.
        label = str(row.get("name") or node_id.split(":", 1)[-1]).strip().strip('"')
        raw_freq = row.get("frequency")
        try:
            freq = float(raw_freq) if raw_freq is not None else 1.0
        except (TypeError, ValueError, OverflowError):
            # A non-numeric frequency is treated as a single mention.
            freq = 1.0
        kept.append((record_id, node_id, label, freq))

    if not kept:
        return [], []

    freq_min = min(item[3] for item in kept)
    freq_max = max(item[3] for item in kept)
    total = len(kept)

    nodes: list[dict[str, Any]] = []
    for index, (_, node_id, label, freq) in enumerate(kept):
        angle = (2.0 * math.pi * index) / total
        size = _linear_scale(freq, freq_min, freq_max, 5.0, 20.0)
        nodes.append(
            {
                "id": node_id,
                "label": label,
                "size": round(size, 3),
                "x": round(math.cos(angle), 6),
                "y": round(math.sin(angle), 6),
                "color": "#2563eb",
                "frequency": freq,
            }
        )
    return nodes, [item[0] for item in kept]


def _count_cooccurrences(
    mappings: list[dict[str, Any]], known_ids: set[str], max_per_chunk: int
) -> dict[tuple[str, str], int]:
    """Count, per unordered concept pair, the chunks mentioning both concepts.

    Only concepts in ``known_ids`` are considered. At most ``max_per_chunk``
    distinct concepts are retained per chunk; which ones are retained when
    the cap is hit follows the order of ``mappings``.
    """
    chunk_to_concepts: dict[str, set[str]] = {}
    for row in mappings:
        chunk = row.get("chunk")
        concept = row.get("concept")
        if chunk is None or concept is None:
            continue
        concept_id = _as_str_id(concept)
        if concept_id not in known_ids:
            continue
        members = chunk_to_concepts.setdefault(_as_str_id(chunk), set())
        if len(members) < max_per_chunk:
            members.add(concept_id)

    counts: dict[tuple[str, str], int] = {}
    for members in chunk_to_concepts.values():
        # Sorting makes (a, b) canonical so each pair has exactly one key;
        # chunks with fewer than two concepts contribute no combinations.
        for pair in itertools.combinations(sorted(members), 2):
            counts[pair] = counts.get(pair, 0) + 1
    return counts


def _select_edges(
    counts: dict[tuple[str, str], int], min_weight: int, max_edges: int
) -> list[dict[str, Any]]:
    """Return the heaviest edges as dicts, filtered by weight and capped in number.

    Ties in weight are broken by the (source, target) ids so that the output
    and the ``max_edges`` cut-off are reproducible between runs.
    """
    ranked = sorted(
        ((a, b, w) for (a, b), w in counts.items() if w >= min_weight),
        key=lambda edge: (-edge[2], edge[0], edge[1]),
    )
    ranked = ranked[: max(0, int(max_edges))]
    return [
        {"id": f"e{i}", "source": a, "target": b, "weight": int(w)}
        for i, (a, b, w) in enumerate(ranked)
    ]


def main() -> int:
    """Export the concept co-occurrence graph as JSON and an HTML viewer.

    Parses the command line, reads the top concepts and their chunk mentions
    from SurrealDB, builds nodes and co-occurrence edges, and writes
    ``concept_graph.json`` and ``concept_graph.html`` into the output
    directory. The two output paths are printed, one per line.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: With the message "No concepts found" when the database
            yields no usable concept rows.
    """
    args = _build_arg_parser().parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_json = out_dir / "concept_graph.json"
    out_html = out_dir / "concept_graph.html"

    mappings_surql = """
SELECT in AS chunk, out AS concept
FROM MENTIONS_CONCEPT
WHERE out IN $concepts
"""

    conn = Surreal(args.db_url)
    try:
        conn.signin({"username": args.username, "password": args.password})
        conn.use(args.namespace, args.db_name)

        top_rows = _top_concepts(conn, args.top)
        if not top_rows:
            raise SystemExit("No concepts found")

        nodes, record_ids = _nodes_from_rows(top_rows)
        if not nodes:
            # Every row lacked an id; an empty graph would be useless.
            raise SystemExit("No concepts found")

        mappings = _query_rows(conn, mappings_surql, {"concepts": record_ids})
    finally:
        # Release the connection once all queries are done. The close method
        # is looked up defensively in case the connection class lacks one.
        close = getattr(conn, "close", None)
        if callable(close):
            close()

    # NOTE(review): nothing below draws from `random` (the circular layout is
    # fully determined by rank); the seed is still applied so --seed keeps
    # its effect on the global generator.
    random.seed(args.seed)

    known_ids = {node["id"] for node in nodes}
    edge_counts = _count_cooccurrences(
        mappings, known_ids, args.max_concepts_per_chunk
    )
    edges = _select_edges(edge_counts, args.min_weight, args.max_edges)

    payload = {
        "meta": {
            "db_url": args.db_url,
            "namespace": args.namespace,
            "db_name": args.db_name,
            "top": args.top,
            "min_weight": args.min_weight,
            "max_edges": args.max_edges,
        },
        "nodes": nodes,
        "edges": edges,
    }
    out_json.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    _write_html_viewer(out_html, payload, "Concept Graph (Top Concepts)")
    print(str(out_json))
    print(str(out_html))
    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main())