Spaces:

mr4
/

knowledge-graph-preview

Running

App Files Files Community

knowledge-graph-preview / assets /skills /understand-knowledge /parse-knowledge-base.py

mr4

Upload 136 files

fd8cdf5 verified 18 days ago

raw

history blame contribute delete

18.9 kB

	#!/usr/bin/env python3
	"""
	Deterministic parser for Karpathy-pattern LLM wikis.

	Detects the three-layer pattern (raw sources + wiki markdown + schema),
	extracts structure from markdown files, resolves wikilinks, and derives
	categories from index.md section headings.

	Usage:
	python parse-knowledge-base.py <wiki-directory>

	Output:
	Writes scan-manifest.json to <wiki-directory>/.understand-anything/intermediate/
	"""

	import json
	import os
	import re
	import sys
	from pathlib import Path

	# ---------------------------------------------------------------------------
	# Regex patterns
	# ---------------------------------------------------------------------------
	# [[target]] or [[target|display]]: group 1 is the target, group 2 the
	# optional display text (without the separating pipe).
	WIKILINK_RE = re.compile(r"\[\[([^\]|]+)(?:\|([^\]]+))?\]\]")
	# Leading YAML frontmatter fenced by "---" lines; group 1 is the raw body.
	# Anchored with "^" and no MULTILINE, so it only matches at the start of text.
	FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)
	# Any code fence marker; group 1 is the language tag ("" when absent,
	# which is the case for closing fences).
	CODE_BLOCK_RE = re.compile(r"```(\w*)")
	# ATX headings: group 1 is the run of "#" characters, group 2 the heading text.
	HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
	# Level-2 headings only; group 1 is the heading text.
	INDEX_SECTION_RE = re.compile(r"^##\s+(.+)$", re.MULTILINE)

	# Files that are part of wiki infrastructure, not content articles
	# (compared against lower-cased file names).
	INFRA_FILES = {"index.md", "log.md", "claude.md", "agents.md", "soul.md"}

	# ---------------------------------------------------------------------------
	# Detection: is this a Karpathy-pattern wiki?
	# ---------------------------------------------------------------------------

	def detect_format(root: Path) -> dict:
	    """Report whether *root* looks like a Karpathy-style three-layer LLM wiki.

	    The returned dict holds the boolean signals ``has_index``, ``has_log``,
	    ``has_raw`` and ``has_schema``, the markdown file count ``md_count``,
	    the chosen ``wiki_root`` (as a string), and the verdict in ``detected``
	    and ``format`` ("karpathy" or "unknown").
	    """
	    nested = root / "wiki"

	    def _present(filename: str) -> bool:
	        # A marker file counts whether it sits in root or in root/wiki.
	        return (root / filename).is_file() or (nested / filename).is_file()

	    signals = {
	        "has_index": _present("index.md"),
	        "has_log": _present("log.md"),
	        "has_raw": (root / "raw").is_dir(),
	        "has_schema": any(_present(name) for name in ("CLAUDE.md", "AGENTS.md")),
	    }

	    # The wiki lives in a wiki/ subdirectory when one exists, else in root itself.
	    wiki_root = nested if nested.is_dir() else root

	    # Every markdown file below the wiki root counts, at any depth.
	    signals["md_count"] = len(list(wiki_root.rglob("*.md")))
	    signals["wiki_root"] = str(wiki_root)

	    # Verdict: an index.md plus at least three markdown files.
	    is_wiki = signals["has_index"] and signals["md_count"] >= 3
	    signals["detected"] = is_wiki
	    signals["format"] = "karpathy" if is_wiki else "unknown"
	    return signals


	# ---------------------------------------------------------------------------
	# Markdown extraction helpers
	# ---------------------------------------------------------------------------

	def extract_frontmatter(text: str) -> dict:
	    """Return the leading frontmatter of *text* as a flat ``{key: value}`` dict.

	    This is a line-based split on the first ``:``, not a YAML parser: values
	    are whitespace-trimmed and stripped of surrounding double, then single,
	    quotes. Lines without a colon are ignored. Returns ``{}`` when the text
	    has no frontmatter.
	    """
	    match = FRONTMATTER_RE.match(text)
	    if match is None:
	        return {}

	    fields = {}
	    for raw_line in match.group(1).split("\n"):
	        name, colon, value = raw_line.partition(":")
	        if not colon:
	            continue  # not a "key: value" line
	        fields[name.strip()] = value.strip().strip('"').strip("'")
	    return fields


	def extract_wikilinks(text: str) -> list[dict]:
	    """Return every wikilink in *text*, in document order.

	    Each entry is ``{"target": ..., "display": ...}``; ``display`` is None
	    for the plain ``[[target]]`` form and the trimmed display text for the
	    piped ``[[target|display]]`` form.
	    """
	    found = []
	    for match in WIKILINK_RE.finditer(text):
	        target, display = match.group(1, 2)
	        found.append({
	            "target": target.strip(),
	            "display": display.strip() if display else None,
	        })
	    return found


	def extract_headings(text: str) -> list[dict]:
	    """Return all markdown headings in *text* as ``{"level", "text"}`` dicts.

	    ``level`` is the number of leading ``#`` characters (1-6) and ``text``
	    is the heading content with surrounding whitespace removed.
	    """
	    headings = []
	    for match in HEADING_RE.finditer(text):
	        hashes, title = match.group(1, 2)
	        headings.append({"level": len(hashes), "text": title.strip()})
	    return headings


	def extract_code_blocks(text: str) -> list[str]:
	    """Return the language tag of every fenced code block that declares one.

	    Fence markers without a language tag (including closing fences) are
	    skipped.
	    """
	    fence_tags = (match.group(1) for match in CODE_BLOCK_RE.finditer(text))
	    return [tag for tag in fence_tags if tag]


	def extract_first_paragraph(text: str) -> str:
	    """Extract the first non-empty paragraph after frontmatter and H1.

	    Frontmatter is removed first. The paragraph following the first
	    ``# `` heading that has one is preferred; if no such heading yields a
	    paragraph, the first paragraph of the document is used. Blockquote
	    lines, horizontal rules and heading lines are never part of the result.

	    Returns:
	        The paragraph joined into one line, truncated to 200 characters
	        (197 plus "...") when longer, or "" when nothing qualifies.
	    """
	    max_len = 200
	    # Lines are stripped before matching, so the rule must match with zero
	    # trailing whitespace; "-", "*" and "_" are all valid rule characters.
	    horizontal_rule = re.compile(r"^[-*_]{3,}\s*$")

	    stripped = FRONTMATTER_RE.sub("", text).strip()
	    if not stripped:
	        return ""
	    lines = stripped.split("\n")

	    def _truncate(paragraph: str) -> str:
	        """Cap the summary length, marking the cut with an ellipsis."""
	        if len(paragraph) > max_len:
	            return paragraph[:max_len - 3] + "..."
	        return paragraph

	    def _collect_paragraph(start_lines: list[str]) -> str:
	        """Collect the first paragraph from the given lines."""
	        para: list[str] = []
	        for s_raw in start_lines:
	            s = s_raw.strip()
	            if not s:
	                if para:
	                    break  # Blank line ends the paragraph
	                continue  # Skip leading blank lines
	            if s.startswith(">"):
	                continue  # Skip blockquotes
	            if horizontal_rule.match(s):
	                continue  # Skip horizontal rules
	            if s.startswith("#"):
	                if para:
	                    break  # End paragraph at next heading
	                continue  # Skip headings before paragraph
	            para.append(s)
	        return " ".join(para)

	    # Preferred: the first paragraph that follows an H1.
	    for i, line in enumerate(lines):
	        if line.strip().startswith("# "):
	            result = _collect_paragraph(lines[i + 1:])
	            if result:
	                return _truncate(result)

	    # Fallback: no H1 produced a paragraph, so take the first one from the start.
	    return _truncate(_collect_paragraph(lines))


	def extract_h1(text: str) -> str:
	    """Return the text of the first level-1 heading, or "" if there is none.

	    The heading text is returned with surrounding whitespace removed and is
	    otherwise unmodified.
	    """
	    top_level_titles = (
	        match.group(2).strip()
	        for match in HEADING_RE.finditer(text)
	        if len(match.group(1)) == 1
	    )
	    return next(top_level_titles, "")


	# ---------------------------------------------------------------------------
	# Index.md parsing — categories come from section headings
	# ---------------------------------------------------------------------------

	def parse_index(index_path: Path) -> list[dict]:
	    """Read index.md and group its wikilinks by ``##`` section.

	    Returns one ``{"name": <heading text>, "articles": [<link target>, ...]}``
	    dict per ``##`` heading, in file order. Wikilinks that appear before the
	    first ``##`` heading are ignored. A missing file yields ``[]``.
	    """
	    if not index_path.is_file():
	        return []

	    sections = []
	    active = None  # section currently collecting links; None before the first "##"
	    content = index_path.read_text(encoding="utf-8", errors="replace")
	    for line in content.split("\n"):
	        heading = re.match(r"^##\s+(.+)$", line)
	        if heading is not None:
	            active = {"name": heading.group(1).strip(), "articles": []}
	            sections.append(active)
	        elif active is not None:
	            active["articles"].extend(
	                link.group(1).strip() for link in WIKILINK_RE.finditer(line)
	            )
	    return sections


	# ---------------------------------------------------------------------------
	# Log.md parsing — extract operation timeline
	# ---------------------------------------------------------------------------

	def parse_log(log_path: Path) -> list[dict]:
	    """Parse log.md to extract chronological entries.

	    An entry is a level-2 heading of the form
	    ``## [YYYY-MM-DD] operation | title``; whitespace around the pipe is
	    optional. All other lines are ignored.

	    Returns:
	        One ``{"date", "operation", "title"}`` dict per entry, in file
	        order, or ``[]`` when the file does not exist.
	    """
	    if not log_path.is_file():
	        return []
	    text = log_path.read_text(encoding="utf-8", errors="replace")
	    # The pipe must be escaped: a bare "|" would split the whole pattern into
	    # two alternatives and leave the capture groups unset.
	    log_entry_re = re.compile(
	        r"^##\s+\[(\d{4}-\d{2}-\d{2})\]\s+(\w+)\s*\|\s*(.+)$", re.MULTILINE
	    )
	    return [
	        {
	            "date": m.group(1),
	            "operation": m.group(2),
	            "title": m.group(3).strip(),
	        }
	        for m in log_entry_re.finditer(text)
	    ]


	# ---------------------------------------------------------------------------
	# Main pipeline
	# ---------------------------------------------------------------------------

	def build_name_to_stem_map(wiki_root: Path) -> dict[str, str]:
	    """Build a case-insensitive map from page name to relative stem path.

	    Keys are lower-cased; values keep the original casing of the path
	    relative to *wiki_root* without the ``.md`` suffix
	    (e.g. ``"decisions/decision-foo"``).

	    Two kinds of key are registered:

	    * the full relative stem of every page, which always resolves to that
	      page, and
	    * the bare basename of a page, but only when exactly one page has that
	      basename, so ambiguous short names don't silently resolve to the
	      wrong page.

	    A full-path key always wins over a basename key, so a root-level
	    ``foo.md`` stays reachable as ``foo`` even when ``sub/foo.md`` exists.
	    """
	    full_paths: dict[str, str] = {}
	    stems_by_basename: dict[str, list[str]] = {}
	    # Sorted so the result does not depend on filesystem enumeration order.
	    for md_file in sorted(wiki_root.rglob("*.md")):
	        stem = str(md_file.relative_to(wiki_root).with_suffix(""))
	        full_paths[stem.lower()] = stem
	        stems_by_basename.setdefault(md_file.stem.lower(), []).append(stem)

	    name_map = dict(full_paths)
	    for key, stems in stems_by_basename.items():
	        if len(stems) == 1:
	            # setdefault: never replace a full-path entry with a basename one.
	            name_map.setdefault(key, stems[0])
	    return name_map


	def resolve_wikilink(target: str, name_map: dict[str, str], node_ids: set[str] \| None = None) -> str \| None:
	"""Resolve a wikilink target to an article node ID.

	If node_ids is provided, only resolve to IDs that exist in the set.
	"""
	key = target.lower().strip()
	# Skip targets that are clearly not page names (shell flags, etc.)
	if key.startswith("-"):
	return None
	stem = name_map.get(key)
	if stem:
	candidate = f"article:{stem}"
	# If we have a node set, verify the target exists
	if node_ids is not None and candidate not in node_ids:
	return None
	return candidate
	# Try without directory prefix
	for stored_key, stored_stem in name_map.items():
	if stored_key.endswith("/" + key) or stored_key == key:
	candidate = f"article:{stored_stem}"
	if node_ids is not None and candidate not in node_ids:
	return None
	return candidate
	return None


	def parse_wiki(root: Path) -> dict:
	    """Parse a Karpathy-pattern wiki and produce the scan manifest.

	    Builds three kinds of node (``article`` per content markdown file,
	    ``topic`` per index.md section, ``source`` per file under ``raw/``) and
	    two kinds of edge (``related`` from wikilinks, ``categorized_under``
	    from index.md sections).

	    Args:
	        root: Directory passed by the user; the wiki itself may live in
	            ``root`` or in ``root/wiki``.

	    Returns:
	        A JSON-serialisable dict with the keys ``format``, ``stats``,
	        ``categories``, ``logEntries``, ``nodes``, ``edges`` and
	        ``warnings`` (capped at 50 entries).

	    Raises:
	        SystemExit: with status 1, after printing a JSON error to stderr,
	            when *root* is not detected as a Karpathy-pattern wiki.
	    """
	    detection = detect_format(root)
	    if not detection["detected"]:
	        print(json.dumps({"error": "Not a Karpathy-pattern wiki", "detection": detection}),
	              file=sys.stderr)
	        sys.exit(1)

	    wiki_root = Path(detection["wiki_root"])
	    raw_root = root / "raw"

	    # Build name resolution map
	    name_map = build_name_to_stem_map(wiki_root)

	    # Find index.md and log.md (prefer the wiki root, fall back to root)
	    index_path = wiki_root / "index.md"
	    if not index_path.is_file():
	        index_path = root / "index.md"
	    log_path = wiki_root / "log.md"
	    if not log_path.is_file():
	        log_path = root / "log.md"

	    # Parse index for categories
	    categories = parse_index(index_path)
	    log_entries = parse_log(log_path)

	    # Build category lookup: wikilink target -> category name
	    category_lookup: dict[str, str] = {}
	    for cat in categories:
	        for article_target in cat["articles"]:
	            category_lookup[article_target.lower()] = cat["name"]

	    # --- Collect article files once ---
	    # Only skip infra files at the wiki root level, not in subdirectories
	    # (e.g., wiki/index.md is infra, but wiki/concepts/index.md is content)
	    article_files: list[tuple[Path, Path, str]] = []
	    for md_file in sorted(wiki_root.rglob("*.md")):
	        rel = md_file.relative_to(wiki_root)
	        if rel.parent == Path(".") and rel.name.lower() in INFRA_FILES:
	            continue
	        article_files.append((md_file, rel, str(rel.with_suffix(""))))

	    # All article IDs must be known up front so that links to pages
	    # appearing later in the walk still resolve.
	    article_ids: set[str] = {f"article:{stem}" for _, _, stem in article_files}

	    # --- Build article nodes ---
	    nodes = []
	    edges = []
	    warnings = []
	    stats = {"articles": 0, "sources": 0, "topics": 0, "wikilinks": 0, "unresolved": 0}

	    for md_file, rel, stem in article_files:
	        basename = md_file.stem

	        text = md_file.read_text(encoding="utf-8", errors="replace")
	        h1 = extract_h1(text)
	        frontmatter = extract_frontmatter(text)
	        wikilinks = extract_wikilinks(text)
	        summary = extract_first_paragraph(text)

	        # Derive category from index.md lookup: bare name first, then path
	        category = category_lookup.get(basename.lower(), "")
	        if not category:
	            category = category_lookup.get(stem.lower(), "")

	        # Derive tags (deduplicated)
	        tag_set: set[str] = set()
	        if category:
	            tag_set.add(category.lower())
	        if rel.parent != Path("."):
	            tag_set.add(str(rel.parent))
	        fm_tags = frontmatter.get("tags", "")
	        if fm_tags:
	            tag_set.update(t.strip() for t in fm_tags.split(",") if t.strip())
	        tags = sorted(tag_set)

	        # Complexity from wikilink density
	        wl_count = len(wikilinks)
	        if wl_count > 15:
	            complexity = "complex"
	        elif wl_count > 5:
	            complexity = "moderate"
	        else:
	            complexity = "simple"

	        node_id = f"article:{stem}"
	        nodes.append({
	            "id": node_id,
	            "type": "article",
	            "name": h1 or basename,
	            "filePath": str(rel),
	            "summary": summary or f"Wiki article: {h1 or basename}",
	            "tags": tags,
	            "complexity": complexity,
	            "knowledgeMeta": {
	                "wikilinks": [wl["target"] for wl in wikilinks],
	                "category": category or None,
	                "content": text[:3000],  # First 3000 chars for LLM analysis
	            },
	        })
	        stats["articles"] += 1
	        stats["wikilinks"] += wl_count

	        # Build edges from wikilinks (resolve against known article IDs).
	        # Self-links resolve but produce neither an edge nor a warning.
	        for wl in wikilinks:
	            target_id = resolve_wikilink(wl["target"], name_map, article_ids)
	            if target_id and target_id != node_id:
	                edges.append({
	                    "source": node_id,
	                    "target": target_id,
	                    "type": "related",
	                    "direction": "forward",
	                    "weight": 0.7,
	                })
	            elif not target_id:
	                warnings.append(f"Unresolved wikilink: [[{wl['target']}]] in {rel}")
	                stats["unresolved"] += 1

	    # --- Build topic nodes from index.md categories ---
	    for cat in categories:
	        topic_id = f"topic:{cat['name'].lower().replace(' ', '-')}"
	        nodes.append({
	            "id": topic_id,
	            "type": "topic",
	            "name": cat["name"],
	            "summary": f"Category from index: {cat['name']} ({len(cat['articles'])} articles)",
	            "tags": ["category"],
	            "complexity": "simple",
	        })
	        stats["topics"] += 1

	        # categorized_under edges (only resolve to known article nodes)
	        for article_target in cat["articles"]:
	            article_id = resolve_wikilink(article_target, name_map, article_ids)
	            if article_id:
	                edges.append({
	                    "source": article_id,
	                    "target": topic_id,
	                    "type": "categorized_under",
	                    "direction": "forward",
	                    "weight": 0.6,
	                })

	    # --- Build source nodes from raw/ ---
	    if raw_root.is_dir():
	        for raw_file in sorted(raw_root.rglob("*")):
	            if raw_file.is_file() and not raw_file.name.startswith("."):
	                rel_raw = raw_file.relative_to(root)
	                ext = raw_file.suffix.lower()
	                size_kb = raw_file.stat().st_size / 1024
	                # NOTE(review): the ID drops the extension, so raw files that
	                # differ only by suffix (a.pdf / a.txt) share one ID - confirm
	                # whether consumers rely on this format before changing it.
	                source_id = f"source:{raw_file.relative_to(raw_root).with_suffix('')}"
	                nodes.append({
	                    "id": source_id,
	                    "type": "source",
	                    "name": raw_file.name,
	                    "filePath": str(rel_raw),
	                    "summary": f"Raw source ({ext or 'unknown'}, {size_kb:.0f} KB)",
	                    "tags": ["raw", ext.lstrip(".") or "unknown"],
	                    "complexity": "simple",
	                })
	                stats["sources"] += 1

	    # --- Deduplicate edges (first occurrence wins, order preserved) ---
	    seen_edges: set[tuple[str, str, str]] = set()
	    deduped_edges = []
	    for edge in edges:
	        key = (edge["source"], edge["target"], edge["type"])
	        if key not in seen_edges:
	            seen_edges.add(key)
	            deduped_edges.append(edge)

	    # --- Compute backlinks ---
	    # Derived from the deduplicated edges so that a page linking to the same
	    # target several times is listed once in that target's backlinks.
	    backlink_map: dict[str, list[str]] = {}
	    for edge in deduped_edges:
	        if edge["type"] == "related":
	            backlink_map.setdefault(edge["target"], []).append(edge["source"])
	    for node in nodes:
	        if node["type"] == "article" and "knowledgeMeta" in node:
	            node["knowledgeMeta"]["backlinks"] = backlink_map.get(node["id"], [])

	    return {
	        "format": "karpathy",
	        "stats": stats,
	        "categories": [{"name": c["name"], "count": len(c["articles"])} for c in categories],
	        "logEntries": len(log_entries),
	        "nodes": nodes,
	        "edges": deduped_edges,
	        "warnings": warnings[:50],  # Cap warnings
	    }


	def main():
	    """Command-line entry point: parse the wiki named on the command line.

	    Writes ``scan-manifest.json`` under
	    ``<wiki-directory>/.understand-anything/intermediate/`` and prints a
	    short report to stderr. Exits with status 1 on a missing argument or
	    when the argument is not a directory.
	    """
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: parse-knowledge-base.py <wiki-directory>", file=sys.stderr)
	        sys.exit(1)

	    root = Path(cli_args[0]).resolve()
	    if not root.is_dir():
	        print(f"Error: {root} is not a directory", file=sys.stderr)
	        sys.exit(1)

	    manifest = parse_wiki(root)

	    # Persist the manifest, creating the output directory on first run.
	    out_dir = root / ".understand-anything" / "intermediate"
	    out_dir.mkdir(parents=True, exist_ok=True)
	    out_path = out_dir / "scan-manifest.json"
	    serialized = json.dumps(manifest, indent=2)
	    out_path.write_text(serialized, encoding="utf-8")

	    # Progress goes to stderr.
	    counts = manifest["stats"]
	    report = (
	        f"[parse] Karpathy wiki: {counts['articles']} articles, {counts['sources']} sources, "
	        f"{counts['topics']} topics, {counts['wikilinks']} wikilinks "
	        f"({counts['unresolved']} unresolved)"
	    )
	    print(report, file=sys.stderr)
	    print(f"[parse] Output: {out_path}", file=sys.stderr)


	if __name__ == "__main__":
	    main()