Spaces:

webmuppetnz
/

hmc-rag

Running

webmuppet

Initial commit — health marketing compliance RAG

bad8b6c 5 days ago

5.74 kB

	"""
	Health check for ECE Compliance RAG system.

	Checks Ollama connectivity, model availability, corpus files, and index freshness.

	Usage:
	uv run python scripts/check.py # full check
	uv run python scripts/check.py --ollama # just Ollama check
	"""

	import os
	import sys
	import json
	import argparse
	import urllib.request
	import urllib.error

	# Repository layout, resolved relative to the directory holding this script.
	PROJECT_ROOT = os.path.join(os.path.dirname(__file__), "..")
	CORPUS_DIR = os.path.join(PROJECT_ROOT, "corpus")
	INDEX_DIR = os.path.join(PROJECT_ROOT, "indexes")

	# Local Ollama endpoint and the model tag the checks look for.
	OLLAMA_URL = "http://localhost:11434"
	REQUIRED_MODEL = "qwen2.5:14b"

	# Display name -> file slug for each corpus domain. The two are currently
	# identical; the slug is used to build "<slug>.md" and "<slug>.json" paths.
	DOMAINS = {
	    slug: slug
	    for slug in (
	        "legislation",
	        "licensing-criteria",
	        "ero",
	        "cross-regulator",
	        "reform-context",
	    )
	}

	# ANSI-coloured status glyphs printed at the start of each result line.
	OK = "\033[32m✓\033[0m"
	FAIL = "\033[31m✗\033[0m"
	WARN = "\033[33m!\033[0m"


	def check_ollama():
	    """Check that Ollama is reachable and the required model is listed.

	    Requests ``{OLLAMA_URL}/api/tags`` with a 5-second timeout, then looks for
	    ``REQUIRED_MODEL`` as a substring of any returned model name. One status
	    line is printed per step, plus a remediation hint on failure.

	    Returns:
	        True when the server answered with a JSON object that lists a
	        matching model; False when the server is unreachable, the reply is
	        not a JSON object, or no model name matches.
	    """
	    # 1. Is Ollama reachable?
	    req = urllib.request.Request(f"{OLLAMA_URL}/api/tags")
	    try:
	        with urllib.request.urlopen(req, timeout=5) as resp:
	            raw = resp.read()
	    except OSError:
	        # URLError, ConnectionRefusedError and socket timeouts are all
	        # OSError subclasses, so this single clause covers them.
	        print(f" {FAIL} Ollama not reachable at {OLLAMA_URL}")
	        print(" Run: ollama serve")
	        return False

	    # Something answered on the port; make sure it actually speaks the
	    # expected JSON instead of crashing with a traceback on garbage.
	    try:
	        data = json.loads(raw)
	    except ValueError:  # JSONDecodeError / UnicodeDecodeError
	        data = None
	    if not isinstance(data, dict):
	        print(f" {FAIL} Ollama at {OLLAMA_URL} returned an unexpected response")
	        return False

	    print(f" {OK} Ollama running at {OLLAMA_URL}")

	    # 2. Is the model available?
	    # "or []" tolerates a payload where "models" is present but null.
	    models = [m.get("name", "") for m in data.get("models") or []]
	    model_found = any(REQUIRED_MODEL in m for m in models)

	    if not model_found:
	        print(f" {FAIL} Model {REQUIRED_MODEL} not found")
	        print(f" Available: {', '.join(models) or 'none'}")
	        print(f" Run: ollama pull {REQUIRED_MODEL}")
	        return False

	    print(f" {OK} Model {REQUIRED_MODEL} available")
	    return True


	def check_corpus():
	    """Check that every domain's corpus file exists and print its size.

	    For each entry in ``DOMAINS`` the file ``CORPUS_DIR/<slug>.md`` is looked
	    up. Existing files are reported with their size in KB and line count,
	    marked OK when larger than 1 KB and WARN otherwise. Missing files are
	    reported as NOT FOUND.

	    Returns:
	        True if every corpus file exists, False if at least one is missing.
	    """
	    all_ok = True
	    for name, slug in DOMAINS.items():
	        path = os.path.join(CORPUS_DIR, f"{slug}.md")
	        if not os.path.exists(path):
	            print(f" {FAIL} {name:20s} NOT FOUND")
	            all_ok = False
	            continue

	        size_kb = os.path.getsize(path) / 1024
	        # Decode explicitly as UTF-8 and replace undecodable bytes: the line
	        # count must not depend on the platform's default encoding, and a
	        # stray byte in a corpus file should not abort the health check.
	        with open(path, "r", encoding="utf-8", errors="replace") as f:
	            lines = sum(1 for _ in f)
	        marker = OK if size_kb > 1 else WARN
	        print(f" {marker} {name:20s} {size_kb:6.0f} KB {lines:5d} lines")
	    return all_ok


	def check_indexes():
	    """Check that every domain's index exists, is readable, and is fresh.

	    For each entry in ``DOMAINS`` the file ``INDEX_DIR/<slug>.json`` is
	    inspected and one status line is printed:

	    * missing file -> FAIL, "NOT FOUND"
	    * file that cannot be read or parsed, or whose JSON does not have the
	      expected shape -> FAIL, "UNREADABLE"
	    * index older than ``CORPUS_DIR/<slug>.md`` -> WARN, "STALE"
	    * no truthy ``doc_description`` key -> WARN
	    * otherwise -> OK

	    Returns:
	        True unless at least one index is missing, unreadable, or stale.
	        A missing ``doc_description`` is only a warning.
	    """
	    all_ok = True
	    for name, slug in DOMAINS.items():
	        idx_path = os.path.join(INDEX_DIR, f"{slug}.json")
	        corpus_path = os.path.join(CORPUS_DIR, f"{slug}.md")

	        if not os.path.exists(idx_path):
	            print(f" {FAIL} {name:20s} NOT FOUND")
	            all_ok = False
	            continue

	        size_kb = os.path.getsize(idx_path) / 1024

	        # Check if index is older than corpus (stale)
	        stale = (
	            os.path.exists(corpus_path)
	            and os.path.getmtime(idx_path) < os.path.getmtime(corpus_path)
	        )

	        # Load the index to read its description and count its nodes. Only
	        # the failures a bad index file can cause are caught, so that
	        # genuine programming errors still surface.
	        readable = True
	        try:
	            with open(idx_path, encoding="utf-8") as f:
	                tree = json.load(f)
	            has_desc = bool(tree.get("doc_description"))
	            node_count = len(_count_nodes(tree.get("structure", [])))
	        except (OSError, ValueError, AttributeError, TypeError, RecursionError):
	            readable = False
	            has_desc = False
	            node_count = 0

	        if not readable:
	            # A corrupt index is unusable; do not disguise it as a mere
	            # missing-description warning.
	            marker = FAIL
	            suffix = " UNREADABLE (invalid JSON or unexpected structure)"
	        elif stale:
	            marker = WARN
	            suffix = " STALE (corpus is newer)"
	        elif not has_desc:
	            marker = WARN
	            suffix = " (no doc_description)"
	        else:
	            marker = OK
	            suffix = ""

	        print(f" {marker} {name:20s} {size_kb:6.0f} KB {node_count:3d} nodes{suffix}")

	        if stale or not readable:
	            all_ok = False

	    return all_ok


	def _count_nodes(structure):
	"""Count all nodes in a PageIndex tree structure."""
	nodes = []
	for item in structure:
	nodes.append(item)
	if "nodes" in item and item["nodes"]:
	nodes.extend(_count_nodes(item["nodes"]))
	return nodes


	def check_pageindex():
	    """Report whether the PageIndex checkout is present under the project root.

	    Looks for ``PageIndex/pageindex/page_index_md.py`` below ``PROJECT_ROOT``
	    and prints either a success line or a failure line followed by the
	    ``git clone`` command to run.

	    Returns:
	        True if that file exists, False otherwise.
	    """
	    repo_dir = os.path.join(PROJECT_ROOT, "PageIndex")
	    marker_file = os.path.join(repo_dir, "pageindex", "page_index_md.py")

	    if not os.path.exists(marker_file):
	        print(f" {FAIL} PageIndex repo not found at {repo_dir}")
	        print(" Run: git clone https://github.com/VectifyAI/PageIndex.git")
	        return False

	    print(f" {OK} PageIndex repo found")
	    return True


	def main():
	    """Command-line entry point: run the health checks and exit with a status.

	    With ``--ollama`` only the Ollama check runs. Otherwise the Ollama,
	    PageIndex, corpus and index checks run in that order, followed by a
	    summary line. The process exits with 0 when every executed check
	    returned a truthy result and 1 otherwise.
	    """
	    cli = argparse.ArgumentParser(description="ECE Compliance RAG health check")
	    cli.add_argument("--ollama", action="store_true", help="Only check Ollama")
	    opts = cli.parse_args()

	    # Short path: just the Ollama probe, no banner or summary.
	    if opts.ollama:
	        print("\nOllama")
	        sys.exit(0 if check_ollama() else 1)

	    banner_rule = "=" * 50
	    print("\n" + banner_rule)
	    print("ECE Compliance RAG — System Check")
	    print(banner_rule)

	    # (section heading, result key, check function), in execution order.
	    sections = (
	        ("Ollama", "ollama", check_ollama),
	        ("PageIndex", "pageindex", check_pageindex),
	        ("Corpus files", "corpus", check_corpus),
	        ("Indexes", "indexes", check_indexes),
	    )

	    results = {}
	    for heading, key, run_check in sections:
	        print("\n" + heading)
	        results[key] = run_check()

	    # Summary
	    print("\n" + "-" * 50)
	    failed = [key for key, passed in results.items() if not passed]
	    if failed:
	        print(f"{FAIL} Issues found: {', '.join(failed)}")
	    else:
	        print(f"{OK} All checks passed")

	    print()
	    sys.exit(1 if failed else 0)


	# Run the health check only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()