docchat-backend / validate_graphrag.py
csabhay's picture
feat: strict Graph-RAG reasoning engine with JSON path output + validation script
9af036a
"""Standalone validation: Graph-RAG engine (no LLM token required)."""
import sys, json
sys.path.insert(0, ".")
from backend.rag import RAGEngine
# Engine under test, configured with 95-character chunks and no overlap.
engine = RAGEngine(chunk_size=95, chunk_overlap=0)

# Fixture corpus: three one-sentence paragraphs, separated by blank lines,
# that chain Alpha Dynamics -> Beta Labs -> Orion Health -> Nova BioSystems.
doc = "\n\n".join(
    [
        "Alpha Dynamics acquired Beta Labs in 2023 to expand its diagnostics portfolio.",
        "Beta Labs later formed a strategic alliance with Orion Health for hospital analytics.",
        "Orion Health announced a joint research program with Nova BioSystems focused on predictive care.",
    ]
)
engine.ingest(doc)
# ── 1. Graph index ────────────────────────────────────────────────────────────
# Report the size of the entity graph held by the engine after ingest() and
# require that it is non-empty.
print("=== GRAPH INDEX ===")
nodes = len(engine._entity_to_chunks)

# Total adjacency size, halved. NOTE(review): presumably every edge is stored
# once per endpoint in _entity_graph - confirm against RAGEngine.
degree_total = 0
for neighbours in engine._entity_graph.values():
    degree_total += len(neighbours)
edges = degree_total // 2

print(f" nodes : {nodes}")
print(f" edges : {edges}")
print(f" chunks : {len(engine.chunks)}")

# Five entities referenced by the most chunks; list.sort is stable, so ties
# keep the mapping's own iteration order.
ranked = list(engine._entity_to_chunks.items())
ranked.sort(key=lambda pair: len(pair[1]), reverse=True)
top = ranked[:5]
print(" top entities:", [(entity, sorted(chunk_ids)) for entity, chunk_ids in top])

assert nodes > 0, "Graph must have nodes"
assert edges > 0, "Graph must have edges"
# ── 2. Retrieval – 2-hop ──────────────────────────────────────────────────────
# The query names Orion Health and Alpha Dynamics; the ingested text only
# relates them through Beta Labs, so the result must include that bridge chunk.
print("\n=== RETRIEVAL: 2-hop ===")
r = engine.retrieve("How is Orion Health connected to Alpha Dynamics?")
for c in r:
    # One line per retrieved chunk: index, score, and a 70-character preview.
    print(f" chunk {c.index} score={c.score:.4f} {c.text[:70]}")
# NOTE(review): assumes chunk 0 is the Alpha Dynamics paragraph and chunk 1 the
# Beta Labs / Orion Health paragraph - depends on how the engine splits the text.
assert any(c.index == 0 for c in r), "Must include Alpha Dynamics chunk"
assert any(c.index == 1 for c in r), "Must include bridge chunk (Beta Labs)"
# ── 3. Retrieval – 3-hop ──────────────────────────────────────────────────────
# Nova BioSystems and Alpha Dynamics are never mentioned together in the
# ingested text, so a useful result has to span more than one chunk.
print("\n=== RETRIEVAL: 3-hop ===")
r2 = engine.retrieve("What links Nova BioSystems to Alpha Dynamics?")
for c in r2:
    # One line per retrieved chunk: index, score, and a 70-character preview.
    print(f" chunk {c.index} score={c.score:.4f} {c.text[:70]}")
assert len(r2) >= 2, "Needs at least 2 bridge chunks for 3-hop"
# ── 4. JSON parser – clean JSON ───────────────────────────────────────────────
# A well-formed reply serialized with json.dumps must come back with its
# fields intact.
print("\n=== JSON PARSER ===")
sample = {
    "answer": "Alpha Dynamics -> Beta Labs -> Orion Health",
    "reasoning_type": "multi-hop",
    "path": ["Alpha Dynamics -> Beta Labs", "Beta Labs -> Orion Health"],
    "used_chunks": ["0", "1"],
    "justification": "Alpha acquired Beta, Beta allied with Orion.",
}
p = engine._parse_graph_response(json.dumps(sample))
assert p["reasoning_type"] == "multi-hop", "Clean JSON must keep reasoning_type"
assert len(p["path"]) == 2, "Clean JSON must keep both path hops"
print(f" reasoning_type : {p['reasoning_type']}")
print(f" path : {p['path']}")
print(f" used_chunks : {p['used_chunks']}")

# ── 5. JSON parser – markdown fenced ─────────────────────────────────────────
# The same payload wrapped in a ```json fence must parse to the same result.
fenced = "```json\n" + json.dumps(sample) + "\n```"
p2 = engine._parse_graph_response(fenced)
assert p2["reasoning_type"] == "multi-hop", "Must strip markdown fences"
print(f" fenced input parsed OK : {p2['reasoning_type']}")

# ── 6. JSON parser – fallback on non-JSON ────────────────────────────────────
# Plain prose must not raise; the parser is expected to label it "direct" and
# still provide an "answer" field.
p3 = engine._parse_graph_response("I cannot answer that question.")
assert p3["reasoning_type"] == "direct", "Non-JSON reply must fall back to 'direct'"
print(f" fallback answer: {p3['answer'][:50]}")
# ── 7. Negative: completely unknown entity (not in graph at all) ──────────────
# ZetaCorp appears nowhere in the ingested text, so whatever is retrieved for
# it must be rated "Low".
print("\n=== RETRIEVAL: negative (unknown entity) ===")
r3 = engine.retrieve("Who is the founder of ZetaCorp Robotics?")
# Only the second item of the returned pair (the label) is checked here.
_, conf = engine._confidence_from_retrieved(r3)
# ASCII separator, matching the final status line below; the previous text
# contained a mis-encoded dash that printed as garbage.
print(f" confidence_label: {conf} (expected Low - ZetaCorp is not in graph)")
assert conf == "Low", "Unknown-entity queries should produce Low confidence"

# Reached only if no assert above fired. NOTE: under `python -O` asserts are
# stripped, so run this script without -O for the message to be meaningful.
print("\nALL ASSERTIONS PASSED - Graph-RAG engine fully validated.")