Spaces:

webmuppetnz
/

hmc-rag

Sleeping

hmc-rag / scripts /build_indexes.py

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 23 days ago

13 kB

	"""
	Build PageIndex trees for each domain compilation in the corpus.
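	Uses a local MLX server (Qwen 2.5 14B) by default for summary generation — zero API cost.
	Set ECE_MODEL to a preset (qwen, gemma, sonnet, opus) or a full litellm model name to use another model.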

	Shows real-time progress for each LLM call.

	Usage:
	uv run python scripts/build_indexes.py # build all
	uv run python scripts/build_indexes.py legislation # build one domain
	uv run python scripts/build_indexes.py --list # show available domains
	uv run python scripts/build_indexes.py --dry-run # show node counts and estimated LLM calls without building
	"""

	import sys
	import os
	import json
	import time
	import argparse

	from dotenv import load_dotenv

	# Load .env from project root
	load_dotenv(os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)

	# Add PageIndex repo to path
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "PageIndex"))

	import litellm

	from pageindex.page_index_md import (
	extract_nodes_from_markdown,
	extract_node_text_content,
	build_tree_from_nodes,
	)
	from pageindex.utils import (
	structure_to_list,
	write_node_id,
	format_structure,
	count_tokens,
	create_clean_structure_for_description,
	)

	# Model selection.
	# Precedence: ECE_MODEL, then the legacy ECE_MLX_MODEL, then the "qwen" preset.
	# A value that is not a preset key is used verbatim as the litellm model name.
	_MODEL_PRESETS = dict(
	    qwen="openai/mlx-community/Qwen2.5-14B-Instruct-4bit",
	    gemma="openai/mlx-community/gemma-4-26b-a4b-it-4bit",
	    sonnet="anthropic/claude-sonnet-4-6",
	    opus="anthropic/claude-opus-4-6",
	)

	_model_choice = (
	    os.environ.get("ECE_MODEL")
	    or os.environ.get("ECE_MLX_MODEL")
	    or "qwen"
	)
	if _model_choice in _MODEL_PRESETS:
	    INDEX_MODEL = _MODEL_PRESETS[_model_choice]
	else:
	    INDEX_MODEL = _model_choice
	LLM_TIMEOUT = 120

	# Models with the "openai/" prefix are routed to the local MLX server by
	# pointing the OpenAI-compatible client settings at it; other models keep
	# whatever credentials the environment already provides.
	MLX_BASE_URL = "http://localhost:8080/v1"
	if INDEX_MODEL.startswith("openai/"):
	    os.environ.update(OPENAI_API_KEY="mlx", OPENAI_API_BASE=MLX_BASE_URL)

	# Corpus input and index output live next to the scripts directory.
	_SCRIPT_DIR = os.path.dirname(__file__)
	CORPUS_DIR = os.path.join(_SCRIPT_DIR, "..", "corpus")
	INDEX_DIR = os.path.join(_SCRIPT_DIR, "..", "indexes")
	os.makedirs(INDEX_DIR, exist_ok=True)

	# Each domain corresponds to a "<domain>.md" file under CORPUS_DIR.
	DOMAINS = [
	    "medicines-and-supplements",
	    "advertising-standards",
	    "consumer-protection",
	    "marketing-comms",
	    "practitioner-regulation",
	    "professional-codes",
	]


	def check_mlx_server():
	    """Pre-flight check that the local mlx-lm server is reachable.

	    Sends a GET request to ``{MLX_BASE_URL}/models`` with a 3-second timeout.

	    Returns:
	        True when the server answers with HTTP 200.

	    Exits:
	        Prints how to start the server and calls ``sys.exit(1)`` when the
	        request fails or answers with any other status.
	    """
	    import urllib.request
	    import urllib.error

	    try:
	        req = urllib.request.Request(f"{MLX_BASE_URL}/models", method="GET")
	        with urllib.request.urlopen(req, timeout=3) as resp:
	            if resp.status == 200:
	                return True
	    except (urllib.error.URLError, OSError):
	        # Unreachable server: fall through to the start-up hint below.
	        pass

	    # The hint previously referenced an undefined MLX_MODEL name, which raised
	    # NameError instead of printing. INDEX_MODEL carries the "openai/" routing
	    # prefix used for local models; the server command takes the id without it.
	    provider_prefix = "openai/"
	    if INDEX_MODEL.startswith(provider_prefix):
	        server_model = INDEX_MODEL[len(provider_prefix):]
	    else:
	        server_model = INDEX_MODEL

	    print(f"\n ERROR: mlx-lm server not running at {MLX_BASE_URL}")
	    print(f" Start it with:")
	    print(f" uv run python -m mlx_lm server --model {server_model} --port 8080\n")
	    sys.exit(1)


	# System prompt for retrieval-summary generation. The rule examples use health
	# marketing compliance terminology so the model is steered toward attributes
	# that actually occur in this corpus.
	SUMMARY_SYSTEM_PROMPT = """\
	You write concise retrieval summaries for a NZ healthcare marketing compliance system.

	Your summaries will be compared against user search queries to decide which sections are relevant. \
	A good summary front-loads the specific topics, requirements, and terminology that someone would \
	search for. A bad summary is generic ("this section covers requirements for...") and could match anything.

	Rules:
	- 1-3 sentences, under 80 words
	- Start with the section, rule, or clause reference and topic, not "This section covers..."
	- Name specific requirements: product types, claim types, mandatory statements, approvals, audiences, time periods
	- Use the same terminology the regulation uses (e.g. "therapeutic purpose" not "health benefit", "medical advertisement" not "medicine ad")
	- Mention any secondary topics the section also covers
	- Do not include opinions or interpretation — just what the section contains"""


	def llm_call(model, prompt, system_prompt=None, timeout=LLM_TIMEOUT):
	    """Run one chat completion through litellm and return the reply text.

	    Args:
	        model: litellm model identifier.
	        prompt: Content of the user message.
	        system_prompt: Optional system message, sent first when truthy.
	        timeout: Timeout value forwarded to litellm.

	    Returns:
	        The first choice's message content, stripped of surrounding
	        whitespace; an empty string when the content is empty or None.
	    """
	    user_message = {"role": "user", "content": prompt}
	    if system_prompt:
	        conversation = [{"role": "system", "content": system_prompt}, user_message]
	    else:
	        conversation = [user_message]

	    reply = litellm.completion(
	        model=model,
	        messages=conversation,
	        temperature=0,
	        timeout=timeout,
	        max_tokens=500,
	    )
	    content = reply.choices[0].message.content
	    return content.strip() if content else ""


	def generate_summary_sync(node, model):
	    """Generate a retrieval summary for one tree node.

	    Nodes whose text is under 200 tokens are returned verbatim without an
	    LLM call; longer nodes are summarised with SUMMARY_SYSTEM_PROMPT.

	    Args:
	        node: Node dict; its ``text`` and ``title`` entries are read.
	        model: litellm model identifier, also passed to the token counter.

	    Returns:
	        The node text (short nodes), the LLM summary, or the marker string
	        ``"[summary error: ...]"`` when the LLM call raises.
	    """
	    # Treat a missing or None text as empty so the return value is always a string.
	    node_text = node.get("text") or ""
	    num_tokens = count_tokens(node_text, model=model)
	    if num_tokens < 200:
	        return node_text

	    title = node.get("title", "")
	    # The prompt names the health marketing compliance corpus this script
	    # indexes; it previously described a different regulatory domain.
	    prompt = (
	        f"Write a retrieval summary for this section of a NZ health marketing compliance document.\n\n"
	        f"Section title: {title}\n\n"
	        f"Full text:\n{node_text}\n\n"
	        f"Summary:"
	    )

	    try:
	        return llm_call(model, prompt, system_prompt=SUMMARY_SYSTEM_PROMPT)
	    except Exception as e:
	        # Best-effort: one failed node must not abort the whole build. The
	        # caller recognises failures by this marker prefix.
	        return f"[summary error: {e}]"


	def generate_doc_description_sync(structure, model):
	    """Generate a one-sentence description of a whole document tree.

	    Args:
	        structure: Tree structure; it is passed through
	            ``create_clean_structure_for_description`` before prompting.
	        model: litellm model identifier.

	    Returns:
	        The LLM's description, or the marker string
	        ``"[description error: ...]"`` when the LLM call raises.
	    """
	    clean_structure = create_clean_structure_for_description(structure)
	    # The prompt names the health marketing compliance corpus this script
	    # indexes; it previously described a different regulatory domain.
	    prompt = (
	        "You are given the structure of a NZ health marketing compliance document "
	        "(legislation, regulatory code, or advertising standard). "
	        "Write a one-sentence description that distinguishes it from other compliance documents. "
	        "Be specific about what domains, rules, or legislation it covers.\n\n"
	        f"Document Structure: {clean_structure}\n\n"
	        "Description:"
	    )

	    try:
	        return llm_call(model, prompt, system_prompt=SUMMARY_SYSTEM_PROMPT)
	    except Exception as e:
	        # Best-effort: a failed description must not discard the built tree.
	        return f"[description error: {e}]"


	def build_tree_with_progress(domain):
	    """Build a PageIndex tree for one domain, printing per-step progress.

	    Reads ``<domain>.md`` from CORPUS_DIR, extracts heading nodes, builds the
	    tree, generates one summary per node with INDEX_MODEL, then generates a
	    document description.

	    Args:
	        domain: Corpus file stem.

	    Returns:
	        A dict with keys ``doc_name``, ``doc_description``, ``line_count`` and
	        ``structure``, or None when the corpus file does not exist.
	    """
	    md_path = os.path.join(CORPUS_DIR, f"{domain}.md")
	    if not os.path.exists(md_path):
	        print(f" SKIP: {md_path} not found")
	        return None

	    size_kb = os.path.getsize(md_path) / 1024
	    with open(md_path, "r", encoding="utf-8") as f:
	        content = f.read()
	    line_count = content.count("\n") + 1
	    print(f" Source: {size_kb:.0f} KB, {line_count} lines")

	    # Step 1: Extract nodes (fast, no LLM)
	    t0 = time.time()
	    node_list, markdown_lines = extract_nodes_from_markdown(content)
	    nodes_with_content = extract_node_text_content(node_list, markdown_lines)
	    print(f" Extracting nodes... {len(nodes_with_content)} found ({time.time()-t0:.1f}s)")

	    # Step 2: Build tree structure (fast, no LLM)
	    tree_structure = build_tree_from_nodes(nodes_with_content)
	    write_node_id(tree_structure)

	    # Format with text included (needed for summary generation)
	    tree_structure = format_structure(
	        tree_structure,
	        order=["title", "node_id", "line_num", "summary", "prefix_summary", "text", "nodes"],
	    )

	    # Step 3: Generate summaries one by one
	    all_nodes = structure_to_list(tree_structure)
	    total = len(all_nodes)
	    llm_needed = sum(1 for n in all_nodes if count_tokens(n.get("text", ""), model=INDEX_MODEL) >= 200)
	    print(f" Generating summaries ({total} nodes, {llm_needed} need LLM)...")

	    # Exact prefix of the marker generate_summary_sync returns on failure.
	    error_marker = "[summary error:"

	    t_sum = time.time()
	    skipped = 0
	    for i, node in enumerate(all_nodes, 1):
	        t_node = time.time()
	        summary = generate_summary_sync(node, INDEX_MODEL)

	        # Match the full marker rather than a bare "[": short nodes are
	        # returned verbatim, so text that itself starts with "[" (e.g. a
	        # markdown link) would otherwise be reported as a failed summary.
	        if summary.startswith(error_marker):
	            skipped += 1
	            print(f" {i}/{total} SKIP {node.get('title', '?')[:40]} — {summary}")
	        else:
	            elapsed_node = time.time() - t_node
	            # Near-instant results mean the node was short and bypassed the LLM.
	            label = f"({elapsed_node:.1f}s)" if elapsed_node > 0.1 else "(skip)"
	            print(f" {i}/{total} {label} {node.get('title', '?')[:50]}")

	        # Leaf nodes store the result as "summary"; nodes with children store
	        # it as "prefix_summary".
	        # NOTE(review): a failure marker is still stored here, as before —
	        # confirm whether downstream retrieval should see it.
	        if not node.get("nodes"):
	            node["summary"] = summary
	        else:
	            node["prefix_summary"] = summary

	    sum_elapsed = time.time() - t_sum
	    print(f" Summaries done: {total - skipped}/{total} in {sum_elapsed:.0f}s")

	    # Step 4: Generate doc description (1 LLM call)
	    print(f" Generating doc description... ", end="", flush=True)
	    t_desc = time.time()
	    doc_description = generate_doc_description_sync(tree_structure, INDEX_MODEL)
	    print(f"done ({time.time()-t_desc:.1f}s)")

	    tree = {
	        "doc_name": domain,
	        "doc_description": doc_description,
	        "line_count": line_count,
	        "structure": tree_structure,
	    }
	    return tree


	def save_tree(domain, tree):
	    """Write a built tree to ``<domain>.json`` under INDEX_DIR.

	    Args:
	        domain: Domain name, used as the file stem.
	        tree: JSON-serialisable tree dict.

	    Returns:
	        A ``(path, size_in_kb)`` tuple for the file that was written.
	    """
	    target = os.path.join(INDEX_DIR, f"{domain}.json")
	    with open(target, "w", encoding="utf-8") as handle:
	        # ensure_ascii=False keeps non-ASCII characters readable in the index.
	        json.dump(tree, handle, indent=2, ensure_ascii=False)
	    written_kb = os.stat(target).st_size / 1024
	    return target, written_kb


	def dry_run_report(targets):
	    """Print node counts and estimated LLM calls without calling any model.

	    For each domain the corpus file is parsed into nodes (no LLM involved),
	    nodes are counted per heading level, and nodes of 200 tokens or more
	    are counted as needing a summary call. One extra call per domain is
	    added for the document description.

	    Args:
	        targets: Iterable of domain names to report on.
	    """
	    print("=" * 60)
	    print("Dry run — no API calls will be made")
	    print(f"Index model: {INDEX_MODEL}")
	    print("=" * 60)

	    total_nodes = 0
	    total_llm = 0

	    for domain in targets:
	        md_path = os.path.join(CORPUS_DIR, f"{domain}.md")
	        if not os.path.exists(md_path):
	            print(f"\n {domain}: corpus not found")
	            continue

	        size_kb = os.path.getsize(md_path) / 1024
	        with open(md_path, "r", encoding="utf-8") as f:
	            content = f.read()
	        line_count = content.count("\n") + 1

	        # Extract nodes (no LLM)
	        node_list, markdown_lines = extract_nodes_from_markdown(content)
	        nodes_with_content = extract_node_text_content(node_list, markdown_lines)

	        # Count by heading level.
	        # NOTE(review): upstream PageIndex markdown nodes appear to carry the
	        # heading depth under "level", not "heading_level" — confirm against
	        # the vendored copy. Falling back keeps the old key working while
	        # avoiding every node being reported as H0.
	        level_counts = {}
	        for n in nodes_with_content:
	            level = n.get("heading_level", n.get("level", 0))
	            level_counts[level] = level_counts.get(level, 0) + 1

	        # Count how many need LLM
	        llm_needed = sum(
	            1 for n in nodes_with_content
	            if count_tokens(n.get("text", ""), model=INDEX_MODEL) >= 200
	        )
	        api_calls = llm_needed + 1  # +1 for doc description

	        total_nodes += len(nodes_with_content)
	        total_llm += api_calls

	        # Check existing index
	        idx_path = os.path.join(INDEX_DIR, f"{domain}.json")
	        idx_status = ""
	        if os.path.exists(idx_path):
	            idx_size = os.path.getsize(idx_path) / 1024
	            idx_status = f" (existing index: {idx_size:.0f} KB)"

	        print(f"\n {domain}")
	        print(f" Corpus: {size_kb:.0f} KB, {line_count} lines")
	        levels_str = ", ".join(f"H{k}={v}" for k, v in sorted(level_counts.items()))
	        print(f" Nodes: {len(nodes_with_content)} ({levels_str})")
	        print(f" LLM calls: {llm_needed} summaries + 1 description = {api_calls}{idx_status}")

	    print(f"\n{'─' * 60}")
	    print(f" Total: {total_nodes} nodes, {total_llm} API calls")
	    print(f"{'=' * 60}")


	def main():
	    """Command-line entry point: list domains, dry-run, or build indexes.

	    With ``--list`` the known domains and their corpus sizes are printed.
	    With ``--dry-run`` the estimate report is printed. Otherwise each
	    requested domain (default: all of DOMAINS) is built and saved. An
	    unknown domain name exits with status 1.
	    """
	    parser = argparse.ArgumentParser(description="Build PageIndex trees")
	    parser.add_argument("domains", nargs="*", help="Specific domains to build (default: all)")
	    parser.add_argument("--list", action="store_true", help="List available domains")
	    parser.add_argument("--dry-run", action="store_true",
	                        help="Show node counts and estimated API calls without building")
	    args = parser.parse_args()

	    if args.list:
	        print("Available domains:")
	        for name in DOMAINS:
	            corpus_file = os.path.join(CORPUS_DIR, f"{name}.md")
	            if os.path.exists(corpus_file):
	                size = f"{os.path.getsize(corpus_file)/1024:.0f} KB"
	            else:
	                size = "not found"
	            print(f" {name:25s} {size}")
	        return

	    # Validate domain names: stop at the first one that is not recognised.
	    targets = args.domains or DOMAINS
	    unknown = next((name for name in targets if name not in DOMAINS), None)
	    if unknown is not None:
	        print(f"Unknown domain: {unknown}")
	        print(f"Available: {', '.join(DOMAINS)}")
	        sys.exit(1)

	    if args.dry_run:
	        dry_run_report(targets)
	        return

	    banner = "=" * 60
	    print(banner)
	    print("Building PageIndex trees")
	    print(f"Index model: {INDEX_MODEL}")
	    print(f"Domains: {', '.join(targets)}")
	    print(banner)

	    # Pre-flight check — only needed for local MLX models
	    uses_local_server = INDEX_MODEL.startswith("openai/")
	    if not uses_local_server:
	        print(f"\nUsing API model: {INDEX_MODEL}")
	    else:
	        print("\nChecking mlx-lm server... ", end="", flush=True)
	        check_mlx_server()
	        print("OK")

	    run_started = time.time()
	    domain_count = len(targets)

	    for position, domain in enumerate(targets, 1):
	        print(f"\n[{position}/{domain_count}] {domain}")
	        print(f" {'─' * 40}")

	        domain_started = time.time()
	        tree = build_tree_with_progress(domain)

	        # A missing corpus file yields None; nothing to save for that domain.
	        if tree is not None:
	            output_path, size_kb = save_tree(domain, tree)
	            elapsed = time.time() - domain_started

	            print(f" Saved: {output_path} ({size_kb:.0f} KB)")
	            print(f" Total: {elapsed:.0f}s")

	    print(f"\n{'=' * 60}")
	    print(f"Done in {time.time()-run_started:.0f}s")
	    print(f"{'=' * 60}")


	if __name__ == "__main__":
	main()