Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""Export a concept co-occurrence graph from SurrealDB.

Outputs (default under ./exports):
- concept_graph.json: Sigma-friendly graph JSON
- concept_graph.html: self-contained HTML viewer (vis-network inlined)

Edges are co-occurrence within the same chunk:
if a chunk mentions concepts A and B, edge(A,B) += 1.
"""
from __future__ import annotations

import argparse
import html
import itertools
import json
import math
import os
import random
import urllib.request
from collections import Counter
from pathlib import Path
from typing import Any

from surrealdb import Surreal
def _query_rows(
    conn: Surreal, surql: str, vars: dict[str, Any] | None = None
) -> list[dict[str, Any]]:
    """Run ``surql`` on ``conn`` and return only the dict rows of the result.

    ``vars`` holds the query bindings; ``None`` or an empty mapping is sent as
    ``{}``. If the driver returns anything other than a list, an empty list is
    returned. Non-dict entries inside a list result are dropped.
    """
    bindings = vars if vars else {}
    result = conn.query(surql, bindings)
    if isinstance(result, list):
        rows: list[dict[str, Any]] = []
        for entry in result:
            if isinstance(entry, dict):
                rows.append(entry)
        return rows
    return []
def _as_str_id(value: Any) -> str:
    """Return ``str(value)``; used to get a hashable, JSON-friendly id."""
    text = str(value)
    return text
def _linear_scale(
    value: float, vmin: float, vmax: float, out_min: float, out_max: float
) -> float:
    """Map ``value`` from ``[vmin, vmax]`` linearly onto ``[out_min, out_max]``.

    The normalised position is clamped to ``[0, 1]``, so inputs outside the
    source range land on the nearest output bound. A source range that is
    empty or inverted (``vmax <= vmin``) yields ``out_min``.
    """
    if vmax <= vmin:
        return out_min
    in_span = vmax - vmin
    out_span = out_max - out_min
    fraction = (value - vmin) / in_span
    fraction = min(1.0, fraction)
    fraction = max(0.0, fraction)
    return out_min + fraction * out_span
def _fetch_text(url: str, timeout_s: int = 20) -> str:
    """Download ``url`` and return its body decoded as UTF-8.

    Bytes that are not valid UTF-8 are replaced rather than raising.
    ``timeout_s`` is passed to ``urlopen`` as its timeout.
    """
    with urllib.request.urlopen(url, timeout=timeout_s) as response:
        raw_body = response.read()
    return raw_body.decode("utf-8", errors="replace")
def _write_html_viewer(output_html: Path, payload: dict[str, Any], title: str) -> None:
    """Write a self-contained HTML viewer using vis-network.

    We download the JS/CSS once at export time and inline it to avoid relying on
    runtime CDN access.

    Args:
        output_html: Destination file; it is overwritten.
        payload: Graph data with ``nodes`` and ``edges`` lists, embedded in the
            page as a JavaScript object literal.
        title: Page title, shown in ``<title>`` and the header. It is
            HTML-escaped before being inserted.

    If the vis-network assets cannot be downloaded, a minimal page describing
    the error is written instead and the function returns normally.
    """
    css_url = (
        "https://cdn.jsdelivr.net/npm/vis-network@9.1.9/styles/vis-network.min.css"
    )
    js_url = "https://cdn.jsdelivr.net/npm/vis-network@9.1.9/standalone/umd/vis-network.min.js"
    safe_title = html.escape(title)

    try:
        vis_css = _fetch_text(css_url)
        vis_js = _fetch_text(js_url)
    except Exception as exc:
        # Deliberately broad: the export is best-effort, so any download
        # failure produces an explanatory page rather than aborting the run.
        output_html.write_text(
            "\n".join(
                [
                    "<!doctype html>",
                    '<meta charset="utf-8" />',
                    f"<title>{safe_title}</title>",
                    "<pre>",
                    "Failed to download vis-network assets while generating HTML.",
                    f"Error: {html.escape(str(exc))}",
                    "</pre>",
                ]
            ),
            encoding="utf-8",
        )
        return

    # The JSON is inlined inside a <script> element, where a literal
    # "</script>" or "<!--" in a label would terminate or corrupt the script.
    # In JSON these characters can only occur inside strings, so swapping them
    # for \uXXXX escapes leaves the decoded data unchanged. U+2028/U+2029 are
    # escaped as well because older JavaScript engines treat them as line
    # terminators inside string literals.
    data_json = json.dumps(payload, ensure_ascii=False)
    for raw, escaped in (
        ("<", "\\u003c"),
        (">", "\\u003e"),
        ("&", "\\u0026"),
        ("\u2028", "\\u2028"),
        ("\u2029", "\\u2029"),
    ):
        data_json = data_json.replace(raw, escaped)

    # f-string + braces => double braces for literal '{' in JS/CSS.
    page = f"""<!doctype html>
<html>
<head>
<meta charset="utf-8" />
<title>{safe_title}</title>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<style>
html, body {{ height: 100%; margin: 0; overflow: hidden; font-family: system-ui, -apple-system, Segoe UI, sans-serif; background:#0b1220; color:#e5e7eb; }}
#app {{ position: fixed; inset: 0; display: flex; flex-direction: column; }}
header {{ flex: 0 0 auto; padding: 10px 12px; border-bottom: 1px solid rgba(148,163,184,.25); display:flex; gap:12px; align-items:baseline; }}
header h1 {{ font-size: 14px; margin: 0; font-weight: 650; letter-spacing: .2px; }}
header .meta {{ color: rgba(226,232,240,.75); font-size: 12px; }}
#network {{ flex: 1 1 auto; min-height: 0; width: 100%; }}
#error {{ position: fixed; left: 0; right: 0; bottom: 0; max-height: 40%; overflow: auto; padding: 10px 12px; white-space: pre-wrap; color: #fecaca; background: rgba(15, 23, 42, .85); border-top: 1px solid rgba(148,163,184,.25); }}
</style>
<style>
{vis_css}
</style>
</head>
<body>
<div id="app">
  <header>
    <h1>{safe_title}</h1>
    <div class="meta">drag nodes, scroll to zoom, click node to focus</div>
  </header>
  <div id="network"></div>
</div>
<div id="error"></div>
<script>
{vis_js}
</script>
<script>
const payload = {data_json};
const elErr = document.getElementById('error');
try {{
  const nodes = payload.nodes.map(n => ({{
    id: n.id,
    label: n.label,
    value: Math.max(1, n.size || 1),
    title: (n.label + "\\nfrequency: " + (n.frequency ?? '')),
    color: {{ background: '#2563eb', border: '#93c5fd' }},
    font: {{ color: '#e5e7eb' }},
  }}));
  const edges = payload.edges.map(e => ({{
    id: e.id,
    from: e.source,
    to: e.target,
    value: e.weight || 1,
    title: ("weight: " + (e.weight || 1)),
    color: {{ color: 'rgba(148,163,184,.55)' }},
    smooth: {{ type: 'dynamic' }},
  }}));
  const container = document.getElementById('network');
  const data = {{ nodes: new vis.DataSet(nodes), edges: new vis.DataSet(edges) }};
  const options = {{
    nodes: {{ shape: 'dot', scaling: {{ min: 6, max: 28 }} }},
    edges: {{ scaling: {{ min: 0.2, max: 4 }} }},
    interaction: {{ hover: true, tooltipDelay: 120 }},
    physics: {{ stabilization: {{ iterations: 200, fit: true }}, barnesHut: {{ gravitationalConstant: -12000, springLength: 110 }} }},
  }};
  const network = new vis.Network(container, data, options);
  network.once('stabilizationIterationsDone', () => {{
    try {{ network.fit({{ animation: false }}); }} catch (e) {{}}
  }});
  network.on('click', params => {{
    if (params.nodes.length) network.focus(params.nodes[0], {{ scale: 1.2, animation: {{ duration: 250 }} }});
  }});
}} catch (e) {{
  elErr.textContent = String(e && e.stack ? e.stack : e);
}}
</script>
</body>
</html>
"""
    output_html.write_text(page, encoding="utf-8")
def _top_concepts(conn: Surreal, top: int) -> list[dict[str, Any]]:
    """Return up to ``top`` concepts ordered by how often they are mentioned.

    Frequency is derived by counting MENTIONS_CONCEPT edges per target rather
    than read from the concept table, which doesn't reliably have a frequency
    field in this dataset. Each row carries ``out``, ``id``, ``name`` and
    ``frequency``.
    """
    surql = """
        SELECT out, out.id AS id, out.content AS name, count() AS frequency
        FROM MENTIONS_CONCEPT
        GROUP BY out
        ORDER BY frequency DESC
        LIMIT $top
        """
    bindings = {"top": top}
    return _query_rows(conn, surql, bindings)
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser; connection defaults come from the environment."""
    parser = argparse.ArgumentParser(description="Export concept graph from SurrealDB")
    parser.add_argument("--top", type=int, default=50, help="Top concepts to include")
    parser.add_argument("--min-weight", type=int, default=1, help="Minimum edge weight")
    parser.add_argument(
        "--max-edges", type=int, default=500, help="Maximum edges output"
    )
    parser.add_argument(
        "--max-concepts-per-chunk",
        type=int,
        default=25,
        help="Cap concepts per chunk for co-occurrence",
    )
    parser.add_argument("--output-dir", default="exports", help="Output directory")
    parser.add_argument(
        "--db-url",
        default=os.getenv("KG_DB_URL", "ws://localhost:8000/rpc"),
        help="SurrealDB WS URL",
    )
    parser.add_argument(
        "--db-name",
        default=os.getenv("DB_NAME", "test_db"),
        help="SurrealDB database name",
    )
    parser.add_argument(
        "--namespace",
        default=os.getenv("DB_NS", "kaig"),
        help="SurrealDB namespace",
    )
    parser.add_argument("--username", default=os.getenv("DB_USER", "root"))
    parser.add_argument("--password", default=os.getenv("DB_PASS", "root"))
    parser.add_argument("--seed", type=int, default=7, help="Layout seed")
    return parser


def _concept_nodes(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Turn top-concept rows (each with a non-null ``id``) into graph nodes on a unit circle."""
    concept_ids: list[str] = []
    names: list[str] = []
    freqs: list[float] = []
    for row in rows:
        cid_s = _as_str_id(row["id"])
        concept_ids.append(cid_s)
        # Fall back to the part of the id after the first ':' when the concept
        # has no name, then trim whitespace and surrounding double quotes.
        fallback = cid_s.split(":", 1)[-1]
        names.append(str(row.get("name") or fallback).strip().strip('"'))
        freq = row.get("frequency")
        try:
            freqs.append(float(freq) if freq is not None else 1.0)
        except (TypeError, ValueError, OverflowError):
            # A frequency that cannot be read as a number counts as 1.
            freqs.append(1.0)

    freq_min = min(freqs) if freqs else 1.0
    freq_max = max(freqs) if freqs else 1.0
    total = max(1, len(concept_ids))

    nodes: list[dict[str, Any]] = []
    for i, (cid_s, label, freq) in enumerate(zip(concept_ids, names, freqs)):
        angle = (2.0 * math.pi * i) / total
        size = _linear_scale(freq, freq_min, freq_max, 5.0, 20.0)
        nodes.append(
            {
                "id": cid_s,
                "label": label,
                "size": round(size, 3),
                "x": round(math.cos(angle), 6),
                "y": round(math.sin(angle), 6),
                "color": "#2563eb",
                "frequency": freq,
            }
        )
    return nodes


def _cooccurrence_edges(
    mappings: list[dict[str, Any]],
    concept_ids: set[str],
    *,
    max_concepts_per_chunk: int,
    min_weight: int,
    max_edges: int,
) -> list[dict[str, Any]]:
    """Count concept pairs sharing a chunk and return the heaviest edges."""
    chunk_to_concepts: dict[str, set[str]] = {}
    for row in mappings:
        chunk = row.get("chunk")
        concept = row.get("concept")
        if chunk is None or concept is None:
            continue
        c_id = _as_str_id(concept)
        if c_id not in concept_ids:
            continue
        members = chunk_to_concepts.setdefault(_as_str_id(chunk), set())
        # Once a chunk has reached the cap, further concepts for it are ignored.
        if len(members) < max_concepts_per_chunk:
            members.add(c_id)

    # Sorting each chunk's concepts makes (a, b) a canonical, undirected key.
    edge_counts: Counter[tuple[str, str]] = Counter()
    for members in chunk_to_concepts.values():
        edge_counts.update(itertools.combinations(sorted(members), 2))

    # Heaviest first; ties are broken by endpoint ids so that the surviving
    # edges and their "e<i>" ids do not depend on the database's row order.
    ranked = sorted(
        ((a, b, w) for (a, b), w in edge_counts.items() if w >= min_weight),
        key=lambda edge: (-edge[2], edge[0], edge[1]),
    )
    ranked = ranked[: max(0, int(max_edges))]
    return [
        {"id": f"e{i}", "source": a, "target": b, "weight": int(w)}
        for i, (a, b, w) in enumerate(ranked)
    ]


def main() -> int:
    """Export the top-concept co-occurrence graph as JSON and an HTML viewer.

    Reads the most-mentioned concepts and their chunk memberships from
    SurrealDB, builds nodes and weighted co-occurrence edges, then writes
    ``concept_graph.json`` and ``concept_graph.html`` into the output
    directory and prints both paths.

    Returns:
        0 on success.

    Raises:
        SystemExit: with the message "No concepts found" when the database
            yields no concept with an id.
    """
    args = _build_arg_parser().parse_args()

    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_json = out_dir / "concept_graph.json"
    out_html = out_dir / "concept_graph.html"

    # The connection is used as a context manager so it is closed on every
    # exit path, including the SystemExit below.
    with Surreal(args.db_url) as conn:
        conn.signin({"username": args.username, "password": args.password})
        conn.use(args.namespace, args.db_name)

        # Rows without an id cannot become nodes; drop them before deciding
        # whether there is anything to export.
        top_rows = [
            row for row in _top_concepts(conn, args.top) if row.get("id") is not None
        ]
        if not top_rows:
            raise SystemExit("No concepts found")

        mappings = _query_rows(
            conn,
            """
            SELECT in AS chunk, out AS concept
            FROM MENTIONS_CONCEPT
            WHERE out IN $concepts
            """,
            {"concepts": [row["id"] for row in top_rows]},
        )

    # NOTE(review): nothing in this export draws from `random`; the call is
    # kept so --seed still seeds the global RNG as it did before.
    random.seed(args.seed)

    nodes = _concept_nodes(top_rows)
    edges = _cooccurrence_edges(
        mappings,
        {node["id"] for node in nodes},
        max_concepts_per_chunk=args.max_concepts_per_chunk,
        min_weight=args.min_weight,
        max_edges=args.max_edges,
    )

    payload = {
        "meta": {
            "db_url": args.db_url,
            "namespace": args.namespace,
            "db_name": args.db_name,
            "top": args.top,
            "min_weight": args.min_weight,
            "max_edges": args.max_edges,
        },
        "nodes": nodes,
        "edges": edges,
    }
    out_json.write_text(
        json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
        encoding="utf-8",
    )
    _write_html_viewer(out_html, payload, "Concept Graph (Top Concepts)")

    print(str(out_json))
    print(str(out_html))
    return 0
# Script entry point: run the exporter and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)