Spaces:
Sleeping
Sleeping
| import sys | |
| import os | |
| import pytest | |
| import asyncio | |
| import json | |
| from datetime import datetime | |
| # Add project root to path | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from core.graph_workflow import app as graph_app | |
| # ============================================================================= | |
| # COMPREHENSIVE TEST QUERIES | |
| # ============================================================================= | |
| # Each query tests a specific edge case in the graph retrieval + reasoning pipeline | |
| # | |
| # Categories: | |
| # 1. DELETED CONTENT - Tests if removed text is properly captured | |
| # 2. MULTI-HOP - Tests cross-article relationship traversal | |
| # 3. TEMPORAL EVOLUTION - Tests tracking changes across multiple amendments | |
| # 4. CROSS-ARTICLE - Tests relationships between different articles | |
| # 5. CURRENT STATUS - Tests understanding of present-day law | |
| # 6. COMPARISON - Tests comparing two related provisions | |
| # 7. SCHEDULE CHANGES - Tests non-article constitutional elements | |
| # 8. LANDMARK / STRUCK DOWN - Tests amendments that were invalidated | |
| # 9. INSERTED CONTENT - Tests new articles/parts added | |
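| # 10. MODERN REFORM - Tests recent structural reforms (e.g. the GST framework) | |
| # 11. EMERGENCY - Tests emergency provisions (temporal + deleted content together) | |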
| # ============================================================================= | |
# Evaluation cases for the graph retrieval + reasoning pipeline.
#
# Every entry is a dict with the same five keys:
#   id                - short handle ("Q1" .. "Q15") used to select a single case
#   category          - grouping label used in the per-category summary
#   query             - the natural-language question sent to the workflow
#   expected_keywords - substrings looked for (case-insensitively) in the answer
#   why               - human-readable rationale, printed alongside the query
#
# NOTE(review): the 'β' characters inside several "why" strings look like a
# mis-encoded arrow or dash; they are reproduced unchanged here.
QUERIES = [
    # --- 1. Deleted content (the original failing case) ---
    {
        "id": "Q1",
        "category": "DELETED_CONTENT",
        "query": "What changes did the 44th Amendment make to Article 19?",
        "expected_keywords": ["property", "armed rebellion", "internal disturbance", "19(1)(f)"],
        "why": "Tests if deleted text ('internal disturbance') is captured in graph",
    },

    # --- 2. Multi-hop relationships ---
    {
        "id": "Q2",
        "category": "MULTI_HOP",
        "query": "How does Article 358 interact with Article 19 during a national emergency?",
        "expected_keywords": ["suspend", "emergency", "fundamental rights", "358", "19"],
        "why": "Tests multi-hop: Amendment β Article 358 β SUSPENDS β Article 19",
    },
    {
        "id": "Q3",
        "category": "MULTI_HOP",
        "query": "What is the relationship between Article 368 and Article 13 after the 24th Amendment?",
        "expected_keywords": ["368", "13", "amendment", "fundamental rights", "constitutional amendment"],
        "why": "Tests multi-hop: 24th Amendment modified Art 13 to exempt Art 368",
    },

    # --- 3. Temporal evolution (one article changed by several amendments) ---
    {
        "id": "Q4",
        "category": "TEMPORAL_EVOLUTION",
        "query": "How has Article 15 been modified across different amendments?",
        "expected_keywords": ["reservation", "backward classes", "15(4)", "103", "economically weaker"],
        "why": "Art 15 modified by Amendments 1, 93, 103 β tests tracking evolution",
    },
    {
        "id": "Q5",
        "category": "TEMPORAL_EVOLUTION",
        "query": "Trace the history of Right to Property from a fundamental right to a legal right",
        "expected_keywords": ["31", "44", "300A", "fundamental", "legal right", "property"],
        "why": "Art 31 (fundamental) β 25th/42nd weakened β 44th repealed β Art 300A (legal)",
    },

    # --- 4. Cross-article effects ---
    {
        "id": "Q6",
        "category": "CROSS_ARTICLE",
        "query": "What was the impact of the 42nd Amendment on both fundamental rights and directive principles?",
        "expected_keywords": ["42", "fundamental", "directive", "31C", "part IV"],
        "why": "42nd gave directive principles primacy over fundamental rights via Art 31C",
    },
    {
        "id": "Q7",
        "category": "CROSS_ARTICLE",
        "query": "How did the 7th Amendment reorganize states and affect territorial provisions?",
        "expected_keywords": ["Article 1", "reorganized states", "Union Territories", "schedule"],
        "why": "7th Am modified Art 1, 3, and First Schedule β tests broad cross-article impact",
    },

    # --- 5. Current status ---
    {
        "id": "Q8",
        "category": "CURRENT_STATUS",
        "query": "What is the current constitutional provision for the right to education?",
        "expected_keywords": ["21A", "86", "free", "compulsory", "6 to 14"],
        "why": "86th Am inserted Art 21A β tests INSERTED content and current law",
    },
    {
        "id": "Q9",
        "category": "CURRENT_STATUS",
        "query": "What is the current status of reservation for economically weaker sections?",
        "expected_keywords": ["103", "EWS", "10%", "15(6)", "16(6)"],
        "why": "103rd Am added EWS reservation β tests newest amendments",
    },

    # --- 6. Comparison ---
    {
        "id": "Q10",
        "category": "COMPARISON",
        "query": "Compare the changes made by the 42nd and 44th Amendments",
        "expected_keywords": ["42", "44", "emergency", "fundamental rights", "directive principles"],
        "why": "42nd expanded govt power, 44th reversed it β tests comparative reasoning",
    },

    # --- 7. Schedule and structural changes ---
    {
        "id": "Q11",
        "category": "SCHEDULE",
        "query": "What is the Tenth Schedule and how was it introduced?",
        "expected_keywords": ["52", "anti-defection", "tenth schedule", "disqualification"],
        "why": "52nd Am inserted 10th Schedule β tests Schedule nodes in graph",
    },

    # --- 8. Struck-down / landmark amendments ---
    {
        "id": "Q12",
        "category": "STRUCK_DOWN",
        "query": "What was the NJAC and why was the 99th Amendment struck down?",
        "expected_keywords": ["99", "NJAC", "judicial appointments", "unconstitutional", "collegium"],
        "why": "99th Am struck down by SC β tests handling of invalidated amendments",
    },

    # --- 9. Inserted content (new articles / parts) ---
    {
        "id": "Q13",
        "category": "INSERTED_CONTENT",
        "query": "What new constitutional framework was created for Panchayati Raj institutions?",
        "expected_keywords": ["73", "Part IX", "243", "panchayat", "reservation"],
        "why": "73rd Am inserted entire Part IX (Art 243-243O) β tests bulk insertion",
    },

    # --- 10. GST / modern reforms ---
    {
        "id": "Q14",
        "category": "MODERN_REFORM",
        "query": "How did the 101st Amendment establish the GST framework?",
        "expected_keywords": ["101", "GST", "246A", "goods and services tax", "concurrent"],
        "why": "101st Am added Art 246A, 269A, modified seventh schedule β tests modern changes",
    },

    # --- 11. Emergency provisions (complex multi-article interaction) ---
    {
        "id": "Q15",
        "category": "EMERGENCY",
        "query": "How have emergency provisions under Article 352 been modified since the original Constitution?",
        "expected_keywords": ["352", "emergency", "armed rebellion", "internal disturbance", "44"],
        "why": "Art 352 modified multiple times β tests temporal + deleted content together",
    },
]
| # ============================================================================= | |
| # TEST RUNNER | |
| # ============================================================================= | |
def _split_keywords(answer_text, expected_keywords):
    """Partition expected_keywords into (found, missing) by case-insensitive substring match."""
    haystack = answer_text.lower()
    found, missing = [], []
    for keyword in expected_keywords:
        (found if keyword.lower() in haystack else missing).append(keyword)
    return found, missing


def _percentage(part, whole):
    """Return part/whole as a percentage; 0.0 when whole is zero instead of raising."""
    return part / whole * 100 if whole else 0.0


async def test_quality_answers():
    """Run every case in QUERIES through the workflow and report the results.

    For each case the query is sent to ``graph_app.ainvoke``; the returned
    answer is lower-cased and scanned for the case's expected keywords
    (plain substring match). A case passes when at least 60% of its
    keywords are present. Per-case details, an overall summary, a
    per-category breakdown and the list of failures are printed, and the
    full results are written to ``test_report.json`` next to this file.

    This coroutine only reports: it contains no assertions and does not
    raise when cases fail. Exceptions raised by the workflow itself are not
    caught and will abort the run before the report is written.
    """
    pass_threshold = 60  # minimum percentage of keywords that must be found
    rule = "=" * 70
    results = []

    for case in QUERIES:
        print(f"\n{rule}")
        print(f"π§ͺ [{case['id']}] ({case['category']}) Testing: '{case['query']}'")
        print(f" Why: {case['why']}")
        print(f"{rule}")

        start_time = datetime.now()
        result = await graph_app.ainvoke({"query": case["query"], "retry_count": 0})
        elapsed = (datetime.now() - start_time).total_seconds()

        # `or` fallbacks guard against keys that are present but set to None,
        # which a plain .get(key, default) would pass straight through.
        draft = result.get("draft_answer") or {}
        answer_text = (draft.get("answer") or "").lower()
        critique = result.get("critique") or {}
        confidence = critique.get("final_confidence", 0)
        trace = result.get("trace") or []

        # Exact-match lookup of the retry marker among the trace entries.
        # NOTE(review): the marker contains what looks like a mis-encoded
        # character ('β'); it must equal the label emitted by the workflow
        # byte-for-byte or this flag can never be True - confirm against
        # core.graph_workflow.
        had_retry = "Validate (graph incomplete β forced retry)" in trace

        found, missing = _split_keywords(answer_text, case["expected_keywords"])
        # A case with no expected keywords scores 0% (and so fails) rather
        # than dividing by zero.
        keyword_score = _percentage(len(found), len(case["expected_keywords"]))
        test_pass = keyword_score >= pass_threshold
        status = "β PASS" if test_pass else "β FAIL"

        print(f"\n{status} | Keywords: {keyword_score:.0f}% | Confidence: {confidence} | Time: {elapsed:.1f}s")
        print(f" Found: {found}")
        if missing:
            print(f" Missing: {missing}")
        if had_retry:
            print(" β οΈ Graph was incomplete β forced re-retrieval")
        print(f" Trace: {' β '.join(trace)}")
        print("\n--- ANSWER PREVIEW (first 500 chars) ---")
        print(answer_text[:500])
        print("--- END PREVIEW ---")

        results.append({
            "id": case["id"],
            "category": case["category"],
            "query": case["query"],
            "status": "PASS" if test_pass else "FAIL",
            "keyword_score": keyword_score,
            "confidence": confidence,
            "missing": missing,
            "time_seconds": elapsed,
            "had_retry": had_retry,
            "trace": trace,
        })

    # ----------------------------------------------------------------------
    # Summary report
    # ----------------------------------------------------------------------
    passed = sum(1 for r in results if r["status"] == "PASS")
    failed = len(results) - passed

    print(f"\n\n{rule}")
    print("π TEST SUMMARY REPORT")
    print(f"{rule}")
    print(f"Total: {len(QUERIES)} | Passed: {passed} | Failed: {failed}")
    print(f"Pass Rate: {_percentage(passed, len(QUERIES)):.0f}%")
    print()

    # Per-category pass/fail tally.
    categories = {}
    for r in results:
        counts = categories.setdefault(r["category"], {"pass": 0, "fail": 0})
        counts["pass" if r["status"] == "PASS" else "fail"] += 1

    print(f"{'Category':<25} {'Pass':<6} {'Fail':<6} {'Rate':<8}")
    print("-" * 45)
    for cat, counts in sorted(categories.items()):
        rate = _percentage(counts["pass"], counts["pass"] + counts["fail"])
        print(f"{cat:<25} {counts['pass']:<6} {counts['fail']:<6} {rate:.0f}%")

    # Detail for each failed case.
    failures = [r for r in results if r["status"] == "FAIL"]
    if failures:
        print("\nβ FAILED TESTS:")
        for failure in failures:
            print(f" [{failure['id']}] {failure['query']}")
            print(f" Missing: {failure['missing']}")
            print(f" Confidence: {failure['confidence']}, Had retry: {failure['had_retry']}")

    # Persist the full results as JSON alongside this file.
    report_path = os.path.join(os.path.dirname(__file__), "test_report.json")
    report = {
        "summary": {"total": len(QUERIES), "passed": passed, "failed": failed},
        "results": results,
    }
    with open(report_path, "w", encoding="utf-8") as report_file:
        json.dump(report, report_file, indent=2)
    print(f"\nπ Full report saved to: {report_path}")
| # ============================================================================= | |
| # QUICK TEST - Run a single query for debugging | |
| # ============================================================================= | |
async def test_single(query_id: str):
    """Run one case from QUERIES, selected by its ID (e.g. 'Q1'), for debugging.

    Prints the full answer returned by ``graph_app.ainvoke`` followed by the
    expected keywords that do not occur in it (case-insensitive substring
    match). If no case has the given ID, the available IDs are printed and
    the function returns without invoking the workflow.

    Args:
        query_id: The "id" value of the case to run.
    """
    case = next((c for c in QUERIES if c["id"] == query_id), None)
    if case is None:
        print(f"Query {query_id} not found. Available: {[c['id'] for c in QUERIES]}")
        return

    print(f"π§ͺ Testing: {case['query']}")
    result = await graph_app.ainvoke({"query": case["query"], "retry_count": 0})

    # `or` fallbacks guard against keys that are present but set to None.
    draft = result.get("draft_answer") or {}
    answer = draft.get("answer") or ""
    print(f"\n=== ANSWER ===\n{answer}\n{'='*60}")

    # Lower-case the answer once rather than once per keyword.
    answer_lower = answer.lower()
    missing = [k for k in case["expected_keywords"] if k.lower() not in answer_lower]
    if missing:
        print(f"β Missing: {missing}")
    else:
        print("β All keywords found!")
# Command-line entry point: with --query/-q run only that case, otherwise
# run the whole suite.
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--query", "-q", help="Run single query by ID (e.g. Q1)", default=None)
    selected_id = cli.parse_args().query

    # An omitted or empty ID falls through to the full run.
    entry_point = test_single(selected_id) if selected_id else test_quality_answers()
    asyncio.run(entry_point)