Spaces:

csabhay
/

docchat-backend

Sleeping

App Files Files Community

docchat-backend / validate_graphrag.py

csabhay

feat: strict Graph-RAG reasoning engine with JSON path output + validation script

9af036a 28 days ago

raw

history blame contribute delete

4.15 kB

	"""Standalone validation: Graph-RAG engine (no LLM token required)."""
	import sys, json
	sys.path.insert(0, ".")
	from backend.rag import RAGEngine

	# Build the engine under test and feed it a tiny three-paragraph corpus.
	# Each paragraph states one relation, forming the chain
	# Alpha Dynamics -> Beta Labs -> Orion Health -> Nova BioSystems.
	# NOTE(review): chunk_size=95 / chunk_overlap=0 presumably keep one paragraph
	# per chunk — confirm against RAGEngine's chunker.
	engine = RAGEngine(chunk_overlap=0, chunk_size=95)

	corpus = "".join(
	    (
	        "Alpha Dynamics acquired Beta Labs in 2023 to expand its diagnostics portfolio.\n\n",
	        "Beta Labs later formed a strategic alliance with Orion Health for hospital analytics.\n\n",
	        "Orion Health announced a joint research program with Nova BioSystems focused on predictive care.",
	    )
	)
	engine.ingest(corpus)

	# ── 1. Graph index ────────────────────────────────────────────────────────────
	# Print the size of the entity graph built during ingest and require that it
	# contains at least one node and one edge.
	print("=== GRAPH INDEX ===")
	entity_index = engine._entity_to_chunks
	node_count = len(entity_index)
	# Adjacency sizes are summed and halved — presumably every edge is stored
	# under both of its endpoints (TODO confirm against RAGEngine).
	degree_total = sum(map(len, engine._entity_graph.values()))
	edge_count = degree_total // 2
	print(f" nodes : {node_count}")
	print(f" edges : {edge_count}")
	print(f" chunks : {len(engine.chunks)}")
	# Rank entities by how many entries they map to and keep the five largest.
	ranked = list(entity_index.items())
	ranked.sort(key=lambda pair: len(pair[1]), reverse=True)
	leaders = ranked[:5]
	print(" top entities:", [(name, sorted(members)) for name, members in leaders])
	assert node_count > 0, "Graph must have nodes"
	assert edge_count > 0, "Graph must have edges"

	# ── 2. Retrieval – 2-hop ──────────────────────────────────────────────────────
	# Ask a question whose two named entities appear in different paragraphs and
	# require that both chunk 0 and chunk 1 come back from retrieval.
	print("\n=== RETRIEVAL: 2-hop ===")
	r = engine.retrieve("How is Orion Health connected to Alpha Dynamics?")
	for c in r:
	    # One line per retrieved chunk: index, score, and the first 70 characters.
	    print(f" chunk {c.index} score={c.score:.4f} {c.text[:70]}")
	assert any(c.index == 0 for c in r), "Must include Alpha Dynamics chunk"
	assert any(c.index == 1 for c in r), "Must include bridge chunk (Beta Labs)"

	# ── 3. Retrieval – 3-hop ──────────────────────────────────────────────────────
	# Ask about the two entities at opposite ends of the relation chain and
	# require that retrieval returns at least two chunks.
	print("\n=== RETRIEVAL: 3-hop ===")
	r2 = engine.retrieve("What links Nova BioSystems to Alpha Dynamics?")
	for c in r2:
	    # One line per retrieved chunk: index, score, and the first 70 characters.
	    print(f" chunk {c.index} score={c.score:.4f} {c.text[:70]}")
	assert len(r2) >= 2, "Needs at least 2 bridge chunks for 3-hop"

	# ── 4. JSON parser – clean JSON ───────────────────────────────────────────────
	# Serialize a well-formed response payload and check that the parser returns
	# its reasoning type and two-step path.
	print("\n=== JSON PARSER ===")
	sample = {
	    "answer": "Alpha Dynamics -> Beta Labs -> Orion Health",
	    "reasoning_type": "multi-hop",
	    "path": ["Alpha Dynamics -> Beta Labs", "Beta Labs -> Orion Health"],
	    "used_chunks": ["0", "1"],
	    "justification": "Alpha acquired Beta, Beta allied with Orion.",
	}
	raw_payload = json.dumps(sample)
	parsed = engine._parse_graph_response(raw_payload)
	reasoning = parsed["reasoning_type"]
	assert reasoning == "multi-hop"
	hops = parsed["path"]
	assert len(hops) == 2
	print(f" reasoning_type : {reasoning}")
	print(f" path : {hops}")
	print(f" used_chunks : {parsed['used_chunks']}")

	# ── 5. JSON parser – markdown fenced ─────────────────────────────────────────
	# Wrap the same payload in a ```json fence and check that it still parses.
	fenced = "".join(("```json\n", json.dumps(sample), "\n```"))
	fenced_result = engine._parse_graph_response(fenced)
	fenced_kind = fenced_result["reasoning_type"]
	assert fenced_kind == "multi-hop", "Must strip markdown fences"
	print(f" fenced input parsed OK : {fenced_kind}")

	# ── 6. JSON parser – fallback on non-JSON ────────────────────────────────────
	# Plain prose input must yield a result whose reasoning type is "direct".
	plain_reply = "I cannot answer that question."
	fallback = engine._parse_graph_response(plain_reply)
	assert fallback["reasoning_type"] == "direct"
	answer_preview = fallback["answer"][:50]
	print(f" fallback answer: {answer_preview}")

	# ── 7. Negative: completely unknown entity (not in graph at all) ──────────────
	# Query an entity that never appears in the ingested text and require the
	# confidence label derived from the retrieval result to be "Low".
	print("\n=== RETRIEVAL: negative (unknown entity) ===")
	unknown_hits = engine.retrieve("Who is the founder of ZetaCorp Robotics?")
	# Only the second element of the returned pair (the label) is checked here.
	_ignored, label = engine._confidence_from_retrieved(unknown_hits)
	print(f" confidence_label: {label} (expected Low — ZetaCorp is not in graph)")
	assert label == "Low", "Unknown-entity queries should produce Low confidence"

	# Reached only when every assertion above held.
	print("\nALL ASSERTIONS PASSED - Graph-RAG engine fully validated.")