# ConstitutionAgent / tests /test_quality.py
# Meshyboi's picture
# Upload 53 files
# 0cd3dc5 verified
import sys
import os
import pytest
import asyncio
import json
from datetime import datetime
# Add project root to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.graph_workflow import app as graph_app
# =============================================================================
# COMPREHENSIVE TEST QUERIES
# =============================================================================
# Each query tests a specific edge case in the graph retrieval + reasoning pipeline
#
# Categories:
# 1. DELETED CONTENT - Tests if removed text is properly captured
# 2. MULTI-HOP - Tests cross-article relationship traversal
# 3. TEMPORAL EVOLUTION - Tests tracking changes across multiple amendments
# 4. CROSS-ARTICLE - Tests relationships between different articles
# 5. CURRENT STATUS - Tests understanding of present-day law
# 6. COMPARISON - Tests comparing two related provisions
# 7. SCHEDULE CHANGES - Tests non-article constitutional elements
# 8. LANDMARK / STRUCK DOWN - Tests amendments that were invalidated
# 9. INSERTED CONTENT - Tests new articles/parts added
# 10. MODERN REFORMS - Tests recent structural reforms (e.g. the GST framework)
# 11. EMERGENCY - Tests emergency provisions that span multiple articles
# =============================================================================
# Each entry describes one end-to-end query against the graph workflow:
#   id                - short unique identifier (usable with the --query CLI flag)
#   category          - edge-case family the query belongs to
#   query             - natural-language question sent to the workflow
#   expected_keywords - substrings (matched case-insensitively) the answer should contain
#   why               - human-readable rationale printed alongside the result
QUERIES = [
    # =========================================================================
    # 1. DELETED CONTENT (The original failing case)
    # =========================================================================
    {
        "id": "Q1",
        "category": "DELETED_CONTENT",
        "query": "What changes did the 44th Amendment make to Article 19?",
        "expected_keywords": ["property", "armed rebellion", "internal disturbance", "19(1)(f)"],
        "why": "Tests if deleted text ('internal disturbance') is captured in graph",
    },
    # =========================================================================
    # 2. MULTI-HOP RELATIONSHIPS
    # =========================================================================
    {
        "id": "Q2",
        "category": "MULTI_HOP",
        "query": "How does Article 358 interact with Article 19 during a national emergency?",
        "expected_keywords": ["suspend", "emergency", "fundamental rights", "358", "19"],
        "why": "Tests multi-hop: Amendment → Article 358 → SUSPENDS → Article 19",
    },
    {
        "id": "Q3",
        "category": "MULTI_HOP",
        "query": "What is the relationship between Article 368 and Article 13 after the 24th Amendment?",
        "expected_keywords": ["368", "13", "amendment", "fundamental rights", "constitutional amendment"],
        "why": "Tests multi-hop: 24th Amendment modified Art 13 to exempt Art 368",
    },
    # =========================================================================
    # 3. TEMPORAL EVOLUTION (Same article changed by multiple amendments)
    # =========================================================================
    {
        "id": "Q4",
        "category": "TEMPORAL_EVOLUTION",
        "query": "How has Article 15 been modified across different amendments?",
        "expected_keywords": ["reservation", "backward classes", "15(4)", "103", "economically weaker"],
        "why": "Art 15 modified by Amendments 1, 93, 103 — tests tracking evolution",
    },
    {
        "id": "Q5",
        "category": "TEMPORAL_EVOLUTION",
        "query": "Trace the history of Right to Property from a fundamental right to a legal right",
        "expected_keywords": ["31", "44", "300A", "fundamental", "legal right", "property"],
        "why": "Art 31 (fundamental) → 25th/42nd weakened → 44th repealed → Art 300A (legal)",
    },
    # =========================================================================
    # 4. CROSS-ARTICLE EFFECTS
    # =========================================================================
    {
        "id": "Q6",
        "category": "CROSS_ARTICLE",
        "query": "What was the impact of the 42nd Amendment on both fundamental rights and directive principles?",
        "expected_keywords": ["42", "fundamental", "directive", "31C", "part IV"],
        "why": "42nd gave directive principles primacy over fundamental rights via Art 31C",
    },
    {
        "id": "Q7",
        "category": "CROSS_ARTICLE",
        "query": "How did the 7th Amendment reorganize states and affect territorial provisions?",
        "expected_keywords": ["Article 1", "reorganized states", "Union Territories", "schedule"],
        "why": "7th Am modified Art 1, 3, and First Schedule — tests broad cross-article impact",
    },
    # =========================================================================
    # 5. CURRENT STATUS QUERIES
    # =========================================================================
    {
        "id": "Q8",
        "category": "CURRENT_STATUS",
        "query": "What is the current constitutional provision for the right to education?",
        "expected_keywords": ["21A", "86", "free", "compulsory", "6 to 14"],
        "why": "86th Am inserted Art 21A — tests INSERTED content and current law",
    },
    {
        "id": "Q9",
        "category": "CURRENT_STATUS",
        "query": "What is the current status of reservation for economically weaker sections?",
        "expected_keywords": ["103", "EWS", "10%", "15(6)", "16(6)"],
        "why": "103rd Am added EWS reservation — tests newest amendments",
    },
    # =========================================================================
    # 6. COMPARISON QUERIES
    # =========================================================================
    {
        "id": "Q10",
        "category": "COMPARISON",
        "query": "Compare the changes made by the 42nd and 44th Amendments",
        "expected_keywords": ["42", "44", "emergency", "fundamental rights", "directive principles"],
        "why": "42nd expanded govt power, 44th reversed it — tests comparative reasoning",
    },
    # =========================================================================
    # 7. SCHEDULE & STRUCTURAL CHANGES
    # =========================================================================
    {
        "id": "Q11",
        "category": "SCHEDULE",
        "query": "What is the Tenth Schedule and how was it introduced?",
        "expected_keywords": ["52", "anti-defection", "tenth schedule", "disqualification"],
        "why": "52nd Am inserted 10th Schedule — tests Schedule nodes in graph",
    },
    # =========================================================================
    # 8. STRUCK DOWN / LANDMARK AMENDMENTS
    # =========================================================================
    {
        "id": "Q12",
        "category": "STRUCK_DOWN",
        "query": "What was the NJAC and why was the 99th Amendment struck down?",
        "expected_keywords": ["99", "NJAC", "judicial appointments", "unconstitutional", "collegium"],
        "why": "99th Am struck down by SC — tests handling of invalidated amendments",
    },
    # =========================================================================
    # 9. INSERTED CONTENT (New articles/parts)
    # =========================================================================
    {
        "id": "Q13",
        "category": "INSERTED_CONTENT",
        "query": "What new constitutional framework was created for Panchayati Raj institutions?",
        "expected_keywords": ["73", "Part IX", "243", "panchayat", "reservation"],
        "why": "73rd Am inserted entire Part IX (Art 243-243O) — tests bulk insertion",
    },
    # =========================================================================
    # 10. GST / MODERN REFORMS
    # =========================================================================
    {
        "id": "Q14",
        "category": "MODERN_REFORM",
        "query": "How did the 101st Amendment establish the GST framework?",
        "expected_keywords": ["101", "GST", "246A", "goods and services tax", "concurrent"],
        "why": "101st Am added Art 246A, 269A, modified seventh schedule — tests modern changes",
    },
    # =========================================================================
    # 11. EMERGENCY PROVISIONS (Complex multi-article interaction)
    # =========================================================================
    {
        "id": "Q15",
        "category": "EMERGENCY",
        "query": "How have emergency provisions under Article 352 been modified since the original Constitution?",
        "expected_keywords": ["352", "emergency", "armed rebellion", "internal disturbance", "44"],
        "why": "Art 352 modified multiple times — tests temporal + deleted content together",
    },
]
# =============================================================================
# TEST RUNNER
# =============================================================================
# Minimum percentage of a query's expected keywords that must appear in the
# answer for that query to count as passed.
_KEYWORD_PASS_THRESHOLD = 60


def _match_keywords(answer_text, keywords):
    """Split *keywords* into (found, missing) by case-insensitive substring search."""
    haystack = answer_text.lower()
    found = []
    missing = []
    for keyword in keywords:
        if keyword.lower() in haystack:
            found.append(keyword)
        else:
            missing.append(keyword)
    return found, missing


def _print_summary(results, passed, failed):
    """Print overall totals, a per-category pass/fail table and failed-query details."""
    total = len(results)
    # Guard the division so an empty query list reports 0% instead of crashing.
    overall_rate = passed / total * 100 if total else 0.0

    print(f"\n\n{'='*70}")
    print("📊 TEST SUMMARY REPORT")
    print(f"{'='*70}")
    print(f"Total: {total} | Passed: {passed} | Failed: {failed}")
    print(f"Pass Rate: {overall_rate:.0f}%")
    print()

    # Category breakdown
    categories = {}
    for r in results:
        counts = categories.setdefault(r["category"], {"pass": 0, "fail": 0})
        counts["pass" if r["status"] == "PASS" else "fail"] += 1

    print(f"{'Category':<25} {'Pass':<6} {'Fail':<6} {'Rate':<8}")
    print("-" * 45)
    for cat, counts in sorted(categories.items()):
        cat_total = counts["pass"] + counts["fail"]
        rate = counts["pass"] / cat_total * 100
        print(f"{cat:<25} {counts['pass']:<6} {counts['fail']:<6} {rate:.0f}%")

    # Failed tests detail
    failures = [r for r in results if r["status"] == "FAIL"]
    if failures:
        print("\n❌ FAILED TESTS:")
        for f in failures:
            print(f" [{f['id']}] {f['query']}")
            print(f" Missing: {f['missing']}")
            print(f" Confidence: {f['confidence']}, Had retry: {f['had_retry']}")


@pytest.mark.asyncio
async def test_quality_answers():
    """Run every entry in QUERIES through the graph workflow and report the results.

    For each query the workflow answer is checked for the expected keywords;
    a query passes when at least ``_KEYWORD_PASS_THRESHOLD`` percent of them
    are present. A per-query log and a summary are printed, and a JSON report
    is written to ``test_report.json`` next to this file.

    Raises:
        AssertionError: if any query scores below the threshold. The report
            is written before the assertion so it is available on failure.
    """
    results = []
    passed = 0
    failed = 0

    for q in QUERIES:
        print(f"\n{'='*70}")
        print(f"🧪 [{q['id']}] ({q['category']}) Testing: '{q['query']}'")
        print(f" Why: {q['why']}")
        print(f"{'='*70}")

        start_time = datetime.now()
        initial_state = {"query": q["query"], "retry_count": 0}
        result = await graph_app.ainvoke(initial_state)
        elapsed = (datetime.now() - start_time).total_seconds()

        # `or` fallbacks: a key that is present but set to None would bypass
        # the .get() default and crash on the chained call.
        answer_text = ((result.get("draft_answer") or {}).get("answer") or "").lower()
        trace = result.get("trace") or []
        confidence = (result.get("critique") or {}).get("final_confidence", 0)

        # Detect a forced re-retrieval from the trace. Matching on the ASCII
        # parts of the marker keeps the check independent of how the arrow
        # character inside it is encoded.
        had_retry = any(
            isinstance(step, str)
            and "graph incomplete" in step
            and "forced retry" in step
            for step in trace
        )

        # Check expected keywords
        keywords = q["expected_keywords"]
        found, missing = _match_keywords(answer_text, keywords)
        # A query without keywords has nothing to verify; score it 0 so the
        # misconfiguration shows up as a failure instead of a ZeroDivisionError.
        keyword_score = len(found) / len(keywords) * 100 if keywords else 0.0
        test_pass = keyword_score >= _KEYWORD_PASS_THRESHOLD

        if test_pass:
            passed += 1
            status = "✅ PASS"
        else:
            failed += 1
            status = "❌ FAIL"

        # Print results
        print(f"\n{status} | Keywords: {keyword_score:.0f}% | Confidence: {confidence} | Time: {elapsed:.1f}s")
        print(f" Found: {found}")
        if missing:
            print(f" Missing: {missing}")
        if had_retry:
            print(" ⚠️ Graph was incomplete → forced re-retrieval")
        print(f" Trace: {' → '.join(str(step) for step in trace)}")
        print("\n--- ANSWER PREVIEW (first 500 chars) ---")
        print(answer_text[:500])
        print("--- END PREVIEW ---")

        results.append({
            "id": q["id"],
            "category": q["category"],
            "query": q["query"],
            "status": "PASS" if test_pass else "FAIL",
            "keyword_score": keyword_score,
            "confidence": confidence,
            "missing": missing,
            "time_seconds": elapsed,
            "had_retry": had_retry,
            "trace": trace,
        })

    _print_summary(results, passed, failed)

    # Save report to file
    report_path = os.path.join(os.path.dirname(__file__), "test_report.json")
    with open(report_path, "w", encoding="utf-8") as report_file:
        json.dump(
            {
                "summary": {"total": len(QUERIES), "passed": passed, "failed": failed},
                "results": results,
            },
            report_file,
            indent=2,
        )
    print(f"\n📄 Full report saved to: {report_path}")

    # Without this the test could never fail, whatever the answers contained.
    failed_ids = [r["id"] for r in results if r["status"] == "FAIL"]
    assert not failed_ids, (
        f"{failed} of {len(QUERIES)} queries scored below "
        f"{_KEYWORD_PASS_THRESHOLD}% keyword coverage: {failed_ids}"
    )
# =============================================================================
# QUICK TEST - Run a single query for debugging
# =============================================================================
async def test_single(query_id: str):
    """Run a single query from QUERIES by ID (e.g. 'Q1') for debugging.

    Prints the full answer returned by the graph workflow, followed by the
    expected keywords that are missing from it (case-insensitive substring
    match). Prints the available IDs and returns if *query_id* is unknown.
    """
    q = next((item for item in QUERIES if item["id"] == query_id), None)
    if q is None:
        print(f"Query {query_id} not found. Available: {[item['id'] for item in QUERIES]}")
        return

    print(f"🧪 Testing: {q['query']}")
    result = await graph_app.ainvoke({"query": q["query"], "retry_count": 0})
    # `or` fallbacks: a key present but set to None would bypass the .get()
    # default and crash on the chained call.
    answer = (result.get("draft_answer") or {}).get("answer") or ""
    print(f"\n=== ANSWER ===\n{answer}\n{'='*60}")

    answer_lower = answer.lower()  # lowercase once, not once per keyword
    missing = [k for k in q["expected_keywords"] if k.lower() not in answer_lower]
    if missing:
        print(f"❌ Missing: {missing}")
    else:
        print("✅ All keywords found!")


# The name matches pytest's ``test_*`` discovery pattern, but this is a CLI
# debugging helper that takes a plain argument rather than a fixture. Opting
# out of collection stops pytest erroring on a missing ``query_id`` fixture.
test_single.__test__ = False
if __name__ == "__main__":
    import argparse

    # Script entry point: with --query/-q run only that query ID,
    # otherwise run the complete quality suite.
    cli = argparse.ArgumentParser()
    cli.add_argument("--query", "-q", default=None, help="Run single query by ID (e.g. Q1)")
    selected_id = cli.parse_args().query

    if not selected_id:
        asyncio.run(test_quality_answers())
    else:
        asyncio.run(test_single(selected_id))