import sys
import os
import pytest
import asyncio
import json
from datetime import datetime

# Add project root to path (must happen before the project-local import below)
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.graph_workflow import app as graph_app

# =============================================================================
# COMPREHENSIVE TEST QUERIES
# =============================================================================
# Each query tests a specific edge case in the graph retrieval + reasoning pipeline
#
# Categories:
# 1. DELETED CONTENT - Tests if removed text is properly captured
# 2. MULTI-HOP - Tests cross-article relationship traversal
# 3. TEMPORAL EVOLUTION - Tests tracking changes across multiple amendments
# 4. CROSS-ARTICLE - Tests relationships between different articles
# 5. CURRENT STATUS - Tests understanding of present-day law
# 6. COMPARISON - Tests comparing two related provisions
# 7. SCHEDULE CHANGES - Tests non-article constitutional elements
# 8. LANDMARK / STRUCK DOWN - Tests amendments that were invalidated
# 9. INSERTED CONTENT - Tests new articles/parts added
# 10. GST / MODERN REFORMS - Tests recent structural amendments
# 11. EMERGENCY PROVISIONS - Tests complex multi-article interaction
#
# Entry fields:
#   id                - short identifier used in reports and by --query
#   category          - grouping key for the per-category summary
#   query             - text sent to the graph workflow
#   expected_keywords - substrings looked up (case-insensitively) in the answer
#   why               - human-readable rationale, printed while the query runs
# =============================================================================

# Minimum percentage of expected keywords that must appear for a query to pass.
KEYWORD_PASS_THRESHOLD = 60

QUERIES = [
    # =========================================================================
    # 1. DELETED CONTENT (The original failing case)
    # =========================================================================
    {
        "id": "Q1",
        "category": "DELETED_CONTENT",
        "query": "What changes did the 44th Amendment make to Article 19?",
        "expected_keywords": ["property", "armed rebellion", "internal disturbance", "19(1)(f)"],
        "why": "Tests if deleted text ('internal disturbance') is captured in graph"
    },

    # =========================================================================
    # 2. MULTI-HOP RELATIONSHIPS
    # =========================================================================
    {
        "id": "Q2",
        "category": "MULTI_HOP",
        "query": "How does Article 358 interact with Article 19 during a national emergency?",
        "expected_keywords": ["suspend", "emergency", "fundamental rights", "358", "19"],
        "why": "Tests multi-hop: Amendment → Article 358 → SUSPENDS → Article 19"
    },
    {
        "id": "Q3",
        "category": "MULTI_HOP",
        "query": "What is the relationship between Article 368 and Article 13 after the 24th Amendment?",
        "expected_keywords": ["368", "13", "amendment", "fundamental rights", "constitutional amendment"],
        "why": "Tests multi-hop: 24th Amendment modified Art 13 to exempt Art 368"
    },

    # =========================================================================
    # 3. TEMPORAL EVOLUTION (Same article changed by multiple amendments)
    # =========================================================================
    {
        "id": "Q4",
        "category": "TEMPORAL_EVOLUTION",
        "query": "How has Article 15 been modified across different amendments?",
        "expected_keywords": ["reservation", "backward classes", "15(4)", "103", "economically weaker"],
        "why": "Art 15 modified by Amendments 1, 93, 103 — tests tracking evolution"
    },
    {
        "id": "Q5",
        "category": "TEMPORAL_EVOLUTION",
        "query": "Trace the history of Right to Property from a fundamental right to a legal right",
        "expected_keywords": ["31", "44", "300A", "fundamental", "legal right", "property"],
        "why": "Art 31 (fundamental) → 25th/42nd weakened → 44th repealed → Art 300A (legal)"
    },

    # =========================================================================
    # 4. CROSS-ARTICLE EFFECTS
    # =========================================================================
    {
        "id": "Q6",
        "category": "CROSS_ARTICLE",
        "query": "What was the impact of the 42nd Amendment on both fundamental rights and directive principles?",
        "expected_keywords": ["42", "fundamental", "directive", "31C", "part IV"],
        "why": "42nd gave directive principles primacy over fundamental rights via Art 31C"
    },
    {
        "id": "Q7",
        "category": "CROSS_ARTICLE",
        "query": "How did the 7th Amendment reorganize states and affect territorial provisions?",
        "expected_keywords": ["Article 1", "reorganized states", "Union Territories", "schedule"],
        "why": "7th Am modified Art 1, 3, and First Schedule — tests broad cross-article impact"
    },

    # =========================================================================
    # 5. CURRENT STATUS QUERIES
    # =========================================================================
    {
        "id": "Q8",
        "category": "CURRENT_STATUS",
        "query": "What is the current constitutional provision for the right to education?",
        "expected_keywords": ["21A", "86", "free", "compulsory", "6 to 14"],
        "why": "86th Am inserted Art 21A — tests INSERTED content and current law"
    },
    {
        "id": "Q9",
        "category": "CURRENT_STATUS",
        "query": "What is the current status of reservation for economically weaker sections?",
        "expected_keywords": ["103", "EWS", "10%", "15(6)", "16(6)"],
        "why": "103rd Am added EWS reservation — tests newest amendments"
    },

    # =========================================================================
    # 6. COMPARISON QUERIES
    # =========================================================================
    {
        "id": "Q10",
        "category": "COMPARISON",
        "query": "Compare the changes made by the 42nd and 44th Amendments",
        "expected_keywords": ["42", "44", "emergency", "fundamental rights", "directive principles"],
        "why": "42nd expanded govt power, 44th reversed it — tests comparative reasoning"
    },

    # =========================================================================
    # 7. SCHEDULE & STRUCTURAL CHANGES
    # =========================================================================
    {
        "id": "Q11",
        "category": "SCHEDULE",
        "query": "What is the Tenth Schedule and how was it introduced?",
        "expected_keywords": ["52", "anti-defection", "tenth schedule", "disqualification"],
        "why": "52nd Am inserted 10th Schedule — tests Schedule nodes in graph"
    },

    # =========================================================================
    # 8. STRUCK DOWN / LANDMARK AMENDMENTS
    # =========================================================================
    {
        "id": "Q12",
        "category": "STRUCK_DOWN",
        "query": "What was the NJAC and why was the 99th Amendment struck down?",
        "expected_keywords": ["99", "NJAC", "judicial appointments", "unconstitutional", "collegium"],
        "why": "99th Am struck down by SC — tests handling of invalidated amendments"
    },

    # =========================================================================
    # 9. INSERTED CONTENT (New articles/parts)
    # =========================================================================
    {
        "id": "Q13",
        "category": "INSERTED_CONTENT",
        "query": "What new constitutional framework was created for Panchayati Raj institutions?",
        "expected_keywords": ["73", "Part IX", "243", "panchayat", "reservation"],
        "why": "73rd Am inserted entire Part IX (Art 243-243O) — tests bulk insertion"
    },

    # =========================================================================
    # 10. GST / MODERN REFORMS
    # =========================================================================
    {
        "id": "Q14",
        "category": "MODERN_REFORM",
        "query": "How did the 101st Amendment establish the GST framework?",
        "expected_keywords": ["101", "GST", "246A", "goods and services tax", "concurrent"],
        "why": "101st Am added Art 246A, 269A, modified seventh schedule — tests modern changes"
    },

    # =========================================================================
    # 11. EMERGENCY PROVISIONS (Complex multi-article interaction)
    # =========================================================================
    {
        "id": "Q15",
        "category": "EMERGENCY",
        "query": "How have emergency provisions under Article 352 been modified since the original Constitution?",
        "expected_keywords": ["352", "emergency", "armed rebellion", "internal disturbance", "44"],
        "why": "Art 352 modified multiple times — tests temporal + deleted content together"
    },
]


# =============================================================================
# TEST RUNNER
# =============================================================================

def _match_keywords(answer_text, keywords):
    """Split *keywords* into ``(found, missing)`` lists.

    A keyword counts as found when its lowercase form is a substring of
    *answer_text*; the caller is expected to pass an already-lowercased answer.
    """
    found = []
    missing = []
    for keyword in keywords:
        if keyword.lower() in answer_text:
            found.append(keyword)
        else:
            missing.append(keyword)
    return found, missing


@pytest.mark.asyncio
async def test_quality_answers():
    """Run all test queries and generate a detailed report.

    Each entry of ``QUERIES`` is sent to ``graph_app.ainvoke`` with
    ``retry_count`` set to 0. A query passes when at least
    ``KEYWORD_PASS_THRESHOLD`` percent of its expected keywords occur in the
    returned answer. Per-query details, a per-category breakdown and the list
    of failures are printed, and the results are written to
    ``test_report.json`` next to this file.

    Raises:
        AssertionError: after the report has been written, if any query
            scored below the threshold.
    """
    results = []
    passed = 0
    failed = 0

    for q in QUERIES:
        print(f"\n{'='*70}")
        print(f"🧪 [{q['id']}] ({q['category']}) Testing: '{q['query']}'")
        print(f" Why: {q['why']}")
        print(f"{'='*70}")

        start_time = datetime.now()
        initial_state = {"query": q['query'], "retry_count": 0}
        result = await graph_app.ainvoke(initial_state)
        elapsed = (datetime.now() - start_time).total_seconds()

        # `or` fallbacks also cover keys that are present but hold None,
        # which a plain .get(key, default) would let through.
        draft_answer = result.get("draft_answer") or {}
        answer_text = (draft_answer.get("answer") or "").lower()
        trace = result.get("trace") or []
        confidence = (result.get("critique") or {}).get("final_confidence", 0)

        # Check graph completeness from the trace
        had_retry = "Validate (graph incomplete → forced retry)" in trace

        # Check expected keywords
        expected = q['expected_keywords']
        found, missing = _match_keywords(answer_text, expected)

        # No expected keywords means nothing can be missing; this also avoids
        # dividing by zero.
        keyword_score = len(found) / len(expected) * 100 if expected else 100.0
        test_pass = keyword_score >= KEYWORD_PASS_THRESHOLD

        if test_pass:
            passed += 1
            status = "✅ PASS"
        else:
            failed += 1
            status = "❌ FAIL"

        # Print results
        print(f"\n{status} | Keywords: {keyword_score:.0f}% | Confidence: {confidence} | Time: {elapsed:.1f}s")
        print(f" Found: {found}")
        if missing:
            print(f" Missing: {missing}")
        if had_retry:
            print(f" ⚠️ Graph was incomplete → forced re-retrieval")
        print(f" Trace: {' → '.join(trace)}")
        print(f"\n--- ANSWER PREVIEW (first 500 chars) ---")
        print(answer_text[:500])
        print(f"--- END PREVIEW ---")

        results.append({
            "id": q["id"],
            "category": q["category"],
            "query": q["query"],
            "status": "PASS" if test_pass else "FAIL",
            "keyword_score": keyword_score,
            "confidence": confidence,
            "missing": missing,
            "time_seconds": elapsed,
            "had_retry": had_retry,
            "trace": trace
        })

    # ==========================================================================
    # SUMMARY REPORT
    # ==========================================================================
    total_queries = len(QUERIES)
    pass_rate = passed / total_queries * 100 if total_queries else 0.0

    print(f"\n\n{'='*70}")
    print(f"📊 TEST SUMMARY REPORT")
    print(f"{'='*70}")
    print(f"Total: {total_queries} | Passed: {passed} | Failed: {failed}")
    print(f"Pass Rate: {pass_rate:.0f}%")
    print()

    # Category breakdown
    categories = {}
    for r in results:
        counts = categories.setdefault(r["category"], {"pass": 0, "fail": 0})
        if r["status"] == "PASS":
            counts["pass"] += 1
        else:
            counts["fail"] += 1

    print(f"{'Category':<25} {'Pass':<6} {'Fail':<6} {'Rate':<8}")
    print("-" * 45)
    for cat, counts in sorted(categories.items()):
        total = counts["pass"] + counts["fail"]
        rate = counts["pass"] / total * 100
        print(f"{cat:<25} {counts['pass']:<6} {counts['fail']:<6} {rate:.0f}%")

    # Failed tests detail
    failures = [r for r in results if r["status"] == "FAIL"]
    if failures:
        print(f"\n❌ FAILED TESTS:")
        for failure in failures:
            print(f" [{failure['id']}] {failure['query']}")
            print(f" Missing: {failure['missing']}")
            print(f" Confidence: {failure['confidence']}, Had retry: {failure['had_retry']}")

    # Save report to file
    report_path = os.path.join(os.path.dirname(__file__), "test_report.json")
    with open(report_path, 'w', encoding="utf-8") as report_file:
        json.dump(
            {
                "summary": {"total": total_queries, "passed": passed, "failed": failed},
                "results": results,
            },
            report_file,
            indent=2,
        )
    print(f"\n📄 Full report saved to: {report_path}")

    # Without an assertion pytest reports this test as green even when every
    # query fails. Assert last so the printed summary and the JSON report are
    # always produced first.
    failed_ids = [failure["id"] for failure in failures]
    assert not failed_ids, (
        f"{failed}/{total_queries} quality queries scored below "
        f"{KEYWORD_PASS_THRESHOLD}% keyword coverage: {failed_ids}"
    )
# =============================================================================
# QUICK TEST - Run a single query for debugging
# =============================================================================

async def test_single(query_id: str):
    """Run a single test query by ID (e.g., 'Q1').

    Looks up the entry of ``QUERIES`` whose ``"id"`` equals *query_id*, sends
    its query to ``graph_app.ainvoke`` with ``retry_count`` set to 0, prints
    the full answer, and reports which expected keywords are missing from it
    (case-insensitive substring match). Prints the available IDs and returns
    when *query_id* is unknown.

    This is a debugging helper driven by the ``--query`` command-line option,
    not a pytest test case.
    """
    q = next((entry for entry in QUERIES if entry["id"] == query_id), None)
    if q is None:
        print(f"Query {query_id} not found. Available: {[q['id'] for q in QUERIES]}")
        return

    print(f"🧪 Testing: {q['query']}")
    result = await graph_app.ainvoke({"query": q['query'], "retry_count": 0})

    # `or` fallbacks also cover keys that are present but hold None.
    answer = (result.get("draft_answer") or {}).get("answer") or ""
    print(f"\n=== ANSWER ===\n{answer}\n{'='*60}")

    answer_lower = answer.lower()  # lowercase once instead of once per keyword
    missing = [k for k in q['expected_keywords'] if k.lower() not in answer_lower]
    if missing:
        print(f"❌ Missing: {missing}")
    else:
        print("✅ All keywords found!")


# The name matches pytest's ``test_*`` pattern, but ``query_id`` is not a
# fixture, so collecting this function would error with "fixture 'query_id'
# not found". Opt it out of collection while keeping the public name.
test_single.__test__ = False


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--query", "-q", help="Run single query by ID (e.g. Q1)", default=None)
    args = parser.parse_args()

    # With --query run just that entry; otherwise run the whole suite.
    if args.query:
        asyncio.run(test_single(args.query))
    else:
        asyncio.run(test_quality_answers())