"""
Health check for ECE Compliance RAG system.
Checks Ollama connectivity, model availability, corpus files, and index freshness.
Usage:
uv run python scripts/check.py # full check
uv run python scripts/check.py --ollama # just Ollama check
"""
import os
import sys
import json
import argparse
import urllib.request
import urllib.error
# Resolve project layout relative to this script (scripts/ -> project root).
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), "..")
CORPUS_DIR = os.path.join(PROJECT_ROOT, "corpus")
INDEX_DIR = os.path.join(PROJECT_ROOT, "indexes")

# Local Ollama server endpoint and the model the RAG pipeline requires.
OLLAMA_URL = "http://localhost:11434"
REQUIRED_MODEL = "qwen2.5:14b"

# Display name -> file slug for each compliance domain; slug is used as
# "<slug>.md" in corpus/ and "<slug>.json" in indexes/.
DOMAINS = {
    "legislation": "legislation",
    "licensing-criteria": "licensing-criteria",
    "ero": "ero",
    "cross-regulator": "cross-regulator",
    "reform-context": "reform-context",
}

# ANSI-colored status markers: green check, red cross, yellow warning.
OK = "\033[32m✓\033[0m"
FAIL = "\033[31m✗\033[0m"
WARN = "\033[33m!\033[0m"
def check_ollama():
    """Check that Ollama is reachable and the required model is pulled.

    Returns:
        bool: True when the server responds and REQUIRED_MODEL appears in
        its /api/tags model list; False otherwise (with a hint printed).
    """
    # 1. Is Ollama reachable?
    try:
        req = urllib.request.Request(f"{OLLAMA_URL}/api/tags")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except OSError:
        # urllib.error.URLError and ConnectionRefusedError are both
        # subclasses of OSError, so a single clause covers every
        # "server not running / unreachable" failure mode.
        print(f" {FAIL} Ollama not reachable at {OLLAMA_URL}")
        print(" Run: ollama serve")
        return False
    print(f" {OK} Ollama running at {OLLAMA_URL}")

    # 2. Is the model available? Substring match tolerates tag variants
    # (e.g. "qwen2.5:14b-instruct-q4_K_M" still satisfies "qwen2.5:14b").
    models = [m.get("name", "") for m in data.get("models", [])]
    model_found = any(REQUIRED_MODEL in m for m in models)
    if model_found:
        print(f" {OK} Model {REQUIRED_MODEL} available")
    else:
        print(f" {FAIL} Model {REQUIRED_MODEL} not found")
        print(f" Available: {', '.join(models) or 'none'}")
        print(f" Run: ollama pull {REQUIRED_MODEL}")
        return False
    return True
def check_corpus():
    """Check each domain's corpus file exists; print its size and line count.

    Returns:
        bool: True when every corpus file is present (small files only warn).
    """
    all_ok = True
    for name, slug in DOMAINS.items():
        path = os.path.join(CORPUS_DIR, f"{slug}.md")
        if os.path.exists(path):
            size_kb = os.path.getsize(path) / 1024
            # Explicit encoding avoids the platform-dependent default
            # (PEP 597) — corpus files are markdown, assumed UTF-8.
            with open(path, "r", encoding="utf-8") as f:
                lines = sum(1 for _ in f)
            # A corpus under 1 KB is suspiciously small: warn, don't fail.
            marker = OK if size_kb > 1 else WARN
            print(f" {marker} {name:20s} {size_kb:6.0f} KB {lines:5d} lines")
        else:
            print(f" {FAIL} {name:20s} NOT FOUND")
            all_ok = False
    return all_ok
def check_indexes():
    """Check index files exist, show sizes, and flag stale ones.

    An index is "stale" when its mtime is older than its corpus file's.
    A missing doc_description only warns; staleness or absence fails.

    Returns:
        bool: True when every index exists and none is stale.
    """
    all_ok = True
    for name, slug in DOMAINS.items():
        idx_path = os.path.join(INDEX_DIR, f"{slug}.json")
        corpus_path = os.path.join(CORPUS_DIR, f"{slug}.md")
        if not os.path.exists(idx_path):
            print(f" {FAIL} {name:20s} NOT FOUND")
            all_ok = False
            continue
        size_kb = os.path.getsize(idx_path) / 1024

        # Check if index is older than corpus (stale).
        stale = False
        if os.path.exists(corpus_path):
            idx_mtime = os.path.getmtime(idx_path)
            corpus_mtime = os.path.getmtime(corpus_path)
            if idx_mtime < corpus_mtime:
                stale = True

        # Check if index has doc_description and count its tree nodes.
        try:
            # Explicit encoding avoids the platform-dependent default.
            with open(idx_path, encoding="utf-8") as f:
                tree = json.load(f)
            has_desc = bool(tree.get("doc_description"))
            node_count = len(_count_nodes(tree.get("structure", [])))
        except (OSError, ValueError, AttributeError):
            # OSError: unreadable file; ValueError covers
            # json.JSONDecodeError; AttributeError: top-level JSON is not
            # an object (no .get). Treat all as "index unusable".
            has_desc = False
            node_count = 0

        if stale:
            marker = WARN
            suffix = " STALE (corpus is newer)"
        elif not has_desc:
            marker = WARN
            suffix = " (no doc_description)"
        else:
            marker = OK
            suffix = ""
        print(f" {marker} {name:20s} {size_kb:6.0f} KB {node_count:3d} nodes{suffix}")
        if stale:
            all_ok = False
    return all_ok
def _count_nodes(structure):
"""Count all nodes in a PageIndex tree structure."""
nodes = []
for item in structure:
nodes.append(item)
if "nodes" in item and item["nodes"]:
nodes.extend(_count_nodes(item["nodes"]))
return nodes
def check_pageindex():
    """Check the PageIndex repo is cloned into the project root.

    Looks for the markdown indexer module rather than just the directory,
    so a partial/empty clone is reported as missing.

    Returns:
        bool: True when PageIndex/pageindex/page_index_md.py exists.
    """
    pi_dir = os.path.join(PROJECT_ROOT, "PageIndex")
    pi_module = os.path.join(pi_dir, "pageindex", "page_index_md.py")
    if os.path.exists(pi_module):
        print(f" {OK} PageIndex repo found")
        return True
    else:
        print(f" {FAIL} PageIndex repo not found at {pi_dir}")
        print(" Run: git clone https://github.com/VectifyAI/PageIndex.git")
        return False
def main():
    """Run the health checks and exit 0 on success, 1 on any failure."""
    parser = argparse.ArgumentParser(description="ECE Compliance RAG health check")
    parser.add_argument("--ollama", action="store_true", help="Only check Ollama")
    args = parser.parse_args()

    # Fast path: just the Ollama connectivity check.
    if args.ollama:
        print("\nOllama")
        sys.exit(0 if check_ollama() else 1)

    print("\n" + "=" * 50)
    print("ECE Compliance RAG — System Check")
    print("=" * 50)

    # Table-driven check sequence: (result key, printed label, check fn).
    checks = [
        ("ollama", "Ollama", check_ollama),
        ("pageindex", "PageIndex", check_pageindex),
        ("corpus", "Corpus files", check_corpus),
        ("indexes", "Indexes", check_indexes),
    ]
    results = {}
    for key, label, check_fn in checks:
        print(f"\n{label}")
        results[key] = check_fn()

    # Summary
    print("\n" + "-" * 50)
    all_ok = all(results.values())
    if all_ok:
        print(f"{OK} All checks passed")
    else:
        failed = [k for k, v in results.items() if not v]
        print(f"{FAIL} Issues found: {', '.join(failed)}")
    print()
    sys.exit(0 if all_ok else 1)
# Run the health check only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|