Spaces:

Meshyboi
/

ConstitutionAgent

Sleeping

App Files Files Community

ConstitutionAgent / tests /test_quality.py

Meshyboi

Upload 53 files

0cd3dc5 verified about 1 month ago

raw

history blame contribute delete

14.5 kB

	import sys
	import os
	import pytest
	import asyncio
	import json
	from datetime import datetime

	# Add project root to path
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

	from core.graph_workflow import app as graph_app

	# =============================================================================
	# COMPREHENSIVE TEST QUERIES
	# =============================================================================
	# Each query tests a specific edge case in the graph retrieval + reasoning pipeline
	#
	# Categories:
	# 1. DELETED CONTENT - Tests if removed text is properly captured
	# 2. MULTI-HOP - Tests cross-article relationship traversal
	# 3. TEMPORAL EVOLUTION - Tests tracking changes across multiple amendments
	# 4. CROSS-ARTICLE - Tests relationships between different articles
	# 5. CURRENT STATUS - Tests understanding of present-day law
	# 6. COMPARISON - Tests comparing two related provisions
	# 7. SCHEDULE CHANGES - Tests non-article constitutional elements
	# 8. LANDMARK / STRUCK DOWN - Tests amendments that were invalidated
	# 9. INSERTED CONTENT - Tests new articles/parts added
	# 10. GST / MODERN REFORMS - Tests modern changes (new articles + schedule edits)
	# 11. EMERGENCY PROVISIONS - Tests complex multi-article interaction
	# =============================================================================

	# Evaluation queries, grouped by the retrieval/reasoning behaviour they probe.
	# Every entry has the same five keys, in this order:
	#   "id", "category", "query", "expected_keywords", "why"
	QUERIES = [
	    # --- 1. Deleted content (the original failing case) -------------------
	    {
	        "id": "Q1",
	        "category": "DELETED_CONTENT",
	        "query": "What changes did the 44th Amendment make to Article 19?",
	        "expected_keywords": [
	            "property",
	            "armed rebellion",
	            "internal disturbance",
	            "19(1)(f)",
	        ],
	        "why": "Tests if deleted text ('internal disturbance') is captured in graph",
	    },
	    # --- 2. Multi-hop relationships ---------------------------------------
	    {
	        "id": "Q2",
	        "category": "MULTI_HOP",
	        "query": "How does Article 358 interact with Article 19 during a national emergency?",
	        "expected_keywords": [
	            "suspend",
	            "emergency",
	            "fundamental rights",
	            "358",
	            "19",
	        ],
	        "why": "Tests multi-hop: Amendment → Article 358 → SUSPENDS → Article 19",
	    },
	    {
	        "id": "Q3",
	        "category": "MULTI_HOP",
	        "query": "What is the relationship between Article 368 and Article 13 after the 24th Amendment?",
	        "expected_keywords": [
	            "368",
	            "13",
	            "amendment",
	            "fundamental rights",
	            "constitutional amendment",
	        ],
	        "why": "Tests multi-hop: 24th Amendment modified Art 13 to exempt Art 368",
	    },
	    # --- 3. Temporal evolution (one article, several amendments) ----------
	    {
	        "id": "Q4",
	        "category": "TEMPORAL_EVOLUTION",
	        "query": "How has Article 15 been modified across different amendments?",
	        "expected_keywords": [
	            "reservation",
	            "backward classes",
	            "15(4)",
	            "103",
	            "economically weaker",
	        ],
	        "why": "Art 15 modified by Amendments 1, 93, 103 — tests tracking evolution",
	    },
	    {
	        "id": "Q5",
	        "category": "TEMPORAL_EVOLUTION",
	        "query": "Trace the history of Right to Property from a fundamental right to a legal right",
	        "expected_keywords": [
	            "31",
	            "44",
	            "300A",
	            "fundamental",
	            "legal right",
	            "property",
	        ],
	        "why": "Art 31 (fundamental) → 25th/42nd weakened → 44th repealed → Art 300A (legal)",
	    },
	    # --- 4. Cross-article effects -----------------------------------------
	    {
	        "id": "Q6",
	        "category": "CROSS_ARTICLE",
	        "query": "What was the impact of the 42nd Amendment on both fundamental rights and directive principles?",
	        "expected_keywords": [
	            "42",
	            "fundamental",
	            "directive",
	            "31C",
	            "part IV",
	        ],
	        "why": "42nd gave directive principles primacy over fundamental rights via Art 31C",
	    },
	    {
	        "id": "Q7",
	        "category": "CROSS_ARTICLE",
	        "query": "How did the 7th Amendment reorganize states and affect territorial provisions?",
	        "expected_keywords": [
	            "Article 1",
	            "reorganized states",
	            "Union Territories",
	            "schedule",
	        ],
	        "why": "7th Am modified Art 1, 3, and First Schedule — tests broad cross-article impact",
	    },
	    # --- 5. Current-status queries ----------------------------------------
	    {
	        "id": "Q8",
	        "category": "CURRENT_STATUS",
	        "query": "What is the current constitutional provision for the right to education?",
	        "expected_keywords": [
	            "21A",
	            "86",
	            "free",
	            "compulsory",
	            "6 to 14",
	        ],
	        "why": "86th Am inserted Art 21A — tests INSERTED content and current law",
	    },
	    {
	        "id": "Q9",
	        "category": "CURRENT_STATUS",
	        "query": "What is the current status of reservation for economically weaker sections?",
	        "expected_keywords": [
	            "103",
	            "EWS",
	            "10%",
	            "15(6)",
	            "16(6)",
	        ],
	        "why": "103rd Am added EWS reservation — tests newest amendments",
	    },
	    # --- 6. Comparison queries --------------------------------------------
	    {
	        "id": "Q10",
	        "category": "COMPARISON",
	        "query": "Compare the changes made by the 42nd and 44th Amendments",
	        "expected_keywords": [
	            "42",
	            "44",
	            "emergency",
	            "fundamental rights",
	            "directive principles",
	        ],
	        "why": "42nd expanded govt power, 44th reversed it — tests comparative reasoning",
	    },
	    # --- 7. Schedule & structural changes ---------------------------------
	    {
	        "id": "Q11",
	        "category": "SCHEDULE",
	        "query": "What is the Tenth Schedule and how was it introduced?",
	        "expected_keywords": [
	            "52",
	            "anti-defection",
	            "tenth schedule",
	            "disqualification",
	        ],
	        "why": "52nd Am inserted 10th Schedule — tests Schedule nodes in graph",
	    },
	    # --- 8. Struck-down / landmark amendments -----------------------------
	    {
	        "id": "Q12",
	        "category": "STRUCK_DOWN",
	        "query": "What was the NJAC and why was the 99th Amendment struck down?",
	        "expected_keywords": [
	            "99",
	            "NJAC",
	            "judicial appointments",
	            "unconstitutional",
	            "collegium",
	        ],
	        "why": "99th Am struck down by SC — tests handling of invalidated amendments",
	    },
	    # --- 9. Inserted content (new articles/parts) -------------------------
	    {
	        "id": "Q13",
	        "category": "INSERTED_CONTENT",
	        "query": "What new constitutional framework was created for Panchayati Raj institutions?",
	        "expected_keywords": [
	            "73",
	            "Part IX",
	            "243",
	            "panchayat",
	            "reservation",
	        ],
	        "why": "73rd Am inserted entire Part IX (Art 243-243O) — tests bulk insertion",
	    },
	    # --- 10. GST / modern reforms -----------------------------------------
	    {
	        "id": "Q14",
	        "category": "MODERN_REFORM",
	        "query": "How did the 101st Amendment establish the GST framework?",
	        "expected_keywords": [
	            "101",
	            "GST",
	            "246A",
	            "goods and services tax",
	            "concurrent",
	        ],
	        "why": "101st Am added Art 246A, 269A, modified seventh schedule — tests modern changes",
	    },
	    # --- 11. Emergency provisions (complex multi-article interaction) -----
	    {
	        "id": "Q15",
	        "category": "EMERGENCY",
	        "query": "How have emergency provisions under Article 352 been modified since the original Constitution?",
	        "expected_keywords": [
	            "352",
	            "emergency",
	            "armed rebellion",
	            "internal disturbance",
	            "44",
	        ],
	        "why": "Art 352 modified multiple times — tests temporal + deleted content together",
	    },
	]

	# =============================================================================
	# TEST RUNNER
	# =============================================================================

	# Minimum share of expected keywords (in percent) an answer must contain to pass.
	_KEYWORD_PASS_THRESHOLD = 60


	def _score_keywords(answer_text, expected_keywords):
	    """Split ``expected_keywords`` by whether they occur in ``answer_text``.

	    Matching is a case-insensitive substring test, so a short numeric keyword
	    such as "13" also matches inside a longer number (e.g. "2013").

	    Returns:
	        A ``(found, missing, score)`` tuple, where ``score`` is the percentage
	        of keywords found. An empty keyword list scores 0.0 rather than
	        raising ZeroDivisionError, so a misconfigured query shows up as FAIL.
	    """
	    haystack = answer_text.lower()
	    found = [kw for kw in expected_keywords if kw.lower() in haystack]
	    missing = [kw for kw in expected_keywords if kw.lower() not in haystack]
	    score = len(found) / len(expected_keywords) * 100 if expected_keywords else 0.0
	    return found, missing, score


	@pytest.mark.asyncio
	async def test_quality_answers():
	    """Run every entry of QUERIES through the graph workflow and report answer quality.

	    Each query is sent to ``graph_app.ainvoke``; the returned answer is checked
	    for the query's expected keywords, and the query passes when at least
	    ``_KEYWORD_PASS_THRESHOLD`` percent of them are present. Per-query details
	    and a per-category summary are printed, and the full results are written
	    to ``test_report.json`` next to this file.

	    Raises:
	        AssertionError: if any query scored below the threshold. The check
	            runs last, so the printed summary and the JSON report are always
	            produced first.
	    """
	    import time  # local import: only needed for the monotonic timer below

	    results = []

	    for q in QUERIES:
	        print(f"\n{'='*70}")
	        print(f"🧪 [{q['id']}] ({q['category']}) Testing: '{q['query']}'")
	        print(f" Why: {q['why']}")
	        print(f"{'='*70}")

	        # perf_counter is monotonic, unlike wall-clock datetime.now().
	        started = time.perf_counter()
	        result = await graph_app.ainvoke({"query": q["query"], "retry_count": 0})
	        elapsed = time.perf_counter() - started

	        # `or` fallbacks also cover keys that are present but set to None.
	        answer_text = ((result.get("draft_answer") or {}).get("answer") or "").lower()
	        trace = result.get("trace") or []
	        confidence = (result.get("critique") or {}).get("final_confidence", 0)

	        # This exact trace entry is how a forced re-retrieval is detected.
	        had_retry = "Validate (graph incomplete → forced retry)" in trace

	        found, missing, keyword_score = _score_keywords(answer_text, q["expected_keywords"])
	        test_pass = keyword_score >= _KEYWORD_PASS_THRESHOLD
	        status = "✅ PASS" if test_pass else "❌ FAIL"

	        # Print results
	        print(f"\n{status} | Keywords: {keyword_score:.0f}% | Confidence: {confidence} | Time: {elapsed:.1f}s")
	        print(f" Found: {found}")
	        if missing:
	            print(f" Missing: {missing}")
	        if had_retry:
	            print(" ⚠️ Graph was incomplete → forced re-retrieval")
	        print(f" Trace: {' → '.join(trace)}")

	        print("\n--- ANSWER PREVIEW (first 500 chars) ---")
	        print(answer_text[:500])
	        print("--- END PREVIEW ---")

	        results.append({
	            "id": q["id"],
	            "category": q["category"],
	            "query": q["query"],
	            "status": "PASS" if test_pass else "FAIL",
	            "keyword_score": keyword_score,
	            "confidence": confidence,
	            "missing": missing,
	            "time_seconds": elapsed,
	            "had_retry": had_retry,
	            "trace": trace,
	        })

	    # ======================================================================
	    # SUMMARY REPORT
	    # ======================================================================
	    total = len(QUERIES)
	    failures = [r for r in results if r["status"] == "FAIL"]
	    failed = len(failures)
	    passed = len(results) - failed
	    pass_rate = passed / total * 100 if total else 0.0  # empty QUERIES must not divide by zero

	    print(f"\n\n{'='*70}")
	    print("📊 TEST SUMMARY REPORT")
	    print(f"{'='*70}")
	    print(f"Total: {total} | Passed: {passed} | Failed: {failed}")
	    print(f"Pass Rate: {pass_rate:.0f}%")
	    print()

	    # Category breakdown
	    categories = {}
	    for r in results:
	        counts = categories.setdefault(r["category"], {"pass": 0, "fail": 0})
	        counts["pass" if r["status"] == "PASS" else "fail"] += 1

	    print(f"{'Category':<25} {'Pass':<6} {'Fail':<6} {'Rate':<8}")
	    print("-" * 45)
	    for cat, counts in sorted(categories.items()):
	        cat_total = counts["pass"] + counts["fail"]
	        rate = counts["pass"] / cat_total * 100
	        print(f"{cat:<25} {counts['pass']:<6} {counts['fail']:<6} {rate:.0f}%")

	    # Failed tests detail
	    if failures:
	        print("\n❌ FAILED TESTS:")
	        for failure in failures:
	            print(f" [{failure['id']}] {failure['query']}")
	            print(f" Missing: {failure['missing']}")
	            print(f" Confidence: {failure['confidence']}, Had retry: {failure['had_retry']}")

	    # Save report to file
	    report_path = os.path.join(os.path.dirname(__file__), "test_report.json")
	    with open(report_path, "w", encoding="utf-8") as report_file:
	        json.dump(
	            {"summary": {"total": total, "passed": passed, "failed": failed}, "results": results},
	            report_file,
	            indent=2,
	        )
	    print(f"\n📄 Full report saved to: {report_path}")

	    # Without this, pytest reports success no matter how many queries failed.
	    assert not failures, (
	        f"{failed} of {total} queries scored below {_KEYWORD_PASS_THRESHOLD}% "
	        f"keyword coverage: {[r['id'] for r in failures]}"
	    )

	# =============================================================================
	# QUICK TEST - Run a single query for debugging
	# =============================================================================
	async def test_single(query_id: str):
	    """Run a single test query by ID (e.g., 'Q1') and print the outcome.

	    Looks up the entry of QUERIES whose "id" equals ``query_id``, sends its
	    query to ``graph_app.ainvoke``, prints the full answer, and then lists any
	    expected keywords that are absent (case-insensitive substring match).

	    Args:
	        query_id: ID of the query to run. If no entry has this ID, the
	            available IDs are printed and the function returns without
	            invoking the graph.

	    Returns:
	        None. All output goes to stdout.
	    """
	    q = next((entry for entry in QUERIES if entry["id"] == query_id), None)
	    if q is None:
	        print(f"Query {query_id} not found. Available: {[entry['id'] for entry in QUERIES]}")
	        return

	    print(f"🧪 Testing: {q['query']}")
	    result = await graph_app.ainvoke({"query": q['query'], "retry_count": 0})

	    # `or` fallbacks also cover keys that are present but set to None.
	    answer = (result.get("draft_answer") or {}).get("answer") or ""
	    print(f"\n=== ANSWER ===\n{answer}\n{'='*60}")

	    answer_lower = answer.lower()  # lower-case once, not once per keyword
	    missing = [k for k in q['expected_keywords'] if k.lower() not in answer_lower]
	    if missing:
	        print(f"❌ Missing: {missing}")
	    else:
	        print("✅ All keywords found!")


	# The name matches pytest's "test_*" pattern, but this is a CLI debugging helper
	# with a required argument. Opting out of collection stops pytest from erroring
	# with "fixture 'query_id' not found".
	test_single.__test__ = False

	if __name__ == "__main__":
	    # Script entry point: `-q/--query ID` runs one query, otherwise the full suite.
	    import argparse

	    cli = argparse.ArgumentParser()
	    cli.add_argument("--query", "-q", help="Run single query by ID (e.g. Q1)", default=None)
	    selected_id = cli.parse_args().query

	    # An empty or missing ID falls through to the full run.
	    entry_point = test_single(selected_id) if selected_id else test_quality_answers()
	    asyncio.run(entry_point)