"""
Health check for ECE Compliance RAG system.
Checks Ollama connectivity, model availability, corpus files, and index freshness.
Usage:
uv run python scripts/check.py # full check
uv run python scripts/check.py --ollama # just Ollama check
"""
import os
import sys
import json
import argparse
import urllib.request
import urllib.error
# Resolve project layout relative to this script (scripts/ -> project root).
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), "..")
CORPUS_DIR = os.path.join(PROJECT_ROOT, "corpus")
INDEX_DIR = os.path.join(PROJECT_ROOT, "indexes")

# Local Ollama server endpoint and the model the RAG pipeline requires.
OLLAMA_URL = "http://localhost:11434"
REQUIRED_MODEL = "qwen2.5:14b"

# Display name -> file slug for each compliance domain; slug is used as
# "<slug>.md" in corpus/ and "<slug>.json" in indexes/.
DOMAINS = {
    "legislation": "legislation",
    "licensing-criteria": "licensing-criteria",
    "ero": "ero",
    "cross-regulator": "cross-regulator",
    "reform-context": "reform-context",
}

# ANSI-colored status markers: green check, red cross, yellow warning.
OK = "\033[32m✓\033[0m"
FAIL = "\033[31m✗\033[0m"
WARN = "\033[33m!\033[0m"
def check_ollama():
    """Check that Ollama is reachable and the required model is pulled.

    Returns:
        bool: True when the server responds and REQUIRED_MODEL appears in
        its /api/tags model list; False otherwise (with a hint printed).
    """
    # 1. Is Ollama reachable?
    try:
        req = urllib.request.Request(f"{OLLAMA_URL}/api/tags")
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
    except OSError:
        # urllib.error.URLError and ConnectionRefusedError are both
        # subclasses of OSError, so a single clause covers every
        # "server not running / unreachable" failure mode.
        print(f" {FAIL} Ollama not reachable at {OLLAMA_URL}")
        print(" Run: ollama serve")
        return False
    print(f" {OK} Ollama running at {OLLAMA_URL}")

    # 2. Is the model available? Substring match tolerates tag variants
    # (e.g. "qwen2.5:14b-instruct-q4_K_M" still satisfies "qwen2.5:14b").
    models = [m.get("name", "") for m in data.get("models", [])]
    model_found = any(REQUIRED_MODEL in m for m in models)
    if model_found:
        print(f" {OK} Model {REQUIRED_MODEL} available")
    else:
        print(f" {FAIL} Model {REQUIRED_MODEL} not found")
        print(f" Available: {', '.join(models) or 'none'}")
        print(f" Run: ollama pull {REQUIRED_MODEL}")
        return False
    return True
def check_corpus():
    """Check each domain's corpus file exists; print its size and line count.

    Returns:
        bool: True when every corpus file is present (small files only warn).
    """
    all_ok = True
    for name, slug in DOMAINS.items():
        path = os.path.join(CORPUS_DIR, f"{slug}.md")
        if os.path.exists(path):
            size_kb = os.path.getsize(path) / 1024
            # Explicit encoding avoids the platform-dependent default
            # (PEP 597) — corpus files are markdown, assumed UTF-8.
            with open(path, "r", encoding="utf-8") as f:
                lines = sum(1 for _ in f)
            # A corpus under 1 KB is suspiciously small: warn, don't fail.
            marker = OK if size_kb > 1 else WARN
            print(f" {marker} {name:20s} {size_kb:6.0f} KB {lines:5d} lines")
        else:
            print(f" {FAIL} {name:20s} NOT FOUND")
            all_ok = False
    return all_ok
def check_indexes():
    """Check index files exist, show sizes, and flag stale ones.

    An index is "stale" when its mtime is older than its corpus file's.
    A missing doc_description only warns; staleness or absence fails.

    Returns:
        bool: True when every index exists and none is stale.
    """
    all_ok = True
    for name, slug in DOMAINS.items():
        idx_path = os.path.join(INDEX_DIR, f"{slug}.json")
        corpus_path = os.path.join(CORPUS_DIR, f"{slug}.md")
        if not os.path.exists(idx_path):
            print(f" {FAIL} {name:20s} NOT FOUND")
            all_ok = False
            continue
        size_kb = os.path.getsize(idx_path) / 1024

        # Check if index is older than corpus (stale).
        stale = False
        if os.path.exists(corpus_path):
            idx_mtime = os.path.getmtime(idx_path)
            corpus_mtime = os.path.getmtime(corpus_path)
            if idx_mtime < corpus_mtime:
                stale = True

        # Check if index has doc_description and count its tree nodes.
        try:
            # Explicit encoding avoids the platform-dependent default.
            with open(idx_path, encoding="utf-8") as f:
                tree = json.load(f)
            has_desc = bool(tree.get("doc_description"))
            node_count = len(_count_nodes(tree.get("structure", [])))
        except (OSError, ValueError, AttributeError):
            # OSError: unreadable file; ValueError covers
            # json.JSONDecodeError; AttributeError: top-level JSON is not
            # an object (no .get). Treat all as "index unusable".
            has_desc = False
            node_count = 0

        if stale:
            marker = WARN
            suffix = " STALE (corpus is newer)"
        elif not has_desc:
            marker = WARN
            suffix = " (no doc_description)"
        else:
            marker = OK
            suffix = ""
        print(f" {marker} {name:20s} {size_kb:6.0f} KB {node_count:3d} nodes{suffix}")
        if stale:
            all_ok = False
    return all_ok
def _count_nodes(structure):
"""Count all nodes in a PageIndex tree structure."""
nodes = []
for item in structure:
nodes.append(item)
if "nodes" in item and item["nodes"]:
nodes.extend(_count_nodes(item["nodes"]))
return nodes
def check_pageindex():
    """Check the PageIndex repo is cloned into the project root.

    Looks for the markdown indexer module rather than just the directory,
    so a partial/empty clone is reported as missing.

    Returns:
        bool: True when PageIndex/pageindex/page_index_md.py exists.
    """
    pi_dir = os.path.join(PROJECT_ROOT, "PageIndex")
    pi_module = os.path.join(pi_dir, "pageindex", "page_index_md.py")
    if os.path.exists(pi_module):
        print(f" {OK} PageIndex repo found")
        return True
    else:
        print(f" {FAIL} PageIndex repo not found at {pi_dir}")
        print(" Run: git clone https://github.com/VectifyAI/PageIndex.git")
        return False
def main():
    """Run the health checks and exit 0 on success, 1 on any failure."""
    parser = argparse.ArgumentParser(description="ECE Compliance RAG health check")
    parser.add_argument("--ollama", action="store_true", help="Only check Ollama")
    args = parser.parse_args()

    # Fast path: just the Ollama connectivity check.
    if args.ollama:
        print("\nOllama")
        sys.exit(0 if check_ollama() else 1)

    print("\n" + "=" * 50)
    print("ECE Compliance RAG — System Check")
    print("=" * 50)

    # Table-driven check sequence: (result key, printed label, check fn).
    checks = [
        ("ollama", "Ollama", check_ollama),
        ("pageindex", "PageIndex", check_pageindex),
        ("corpus", "Corpus files", check_corpus),
        ("indexes", "Indexes", check_indexes),
    ]
    results = {}
    for key, label, check_fn in checks:
        print(f"\n{label}")
        results[key] = check_fn()

    # Summary
    print("\n" + "-" * 50)
    all_ok = all(results.values())
    if all_ok:
        print(f"{OK} All checks passed")
    else:
        failed = [k for k, v in results.items() if not v]
        print(f"{FAIL} Issues found: {', '.join(failed)}")
    print()
    sys.exit(0 if all_ok else 1)
# Run the health check only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|