import sys
import os
import pytest
import asyncio
import json
from datetime import datetime

# Add project root to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.graph_workflow import app as graph_app

# =============================================================================
# COMPREHENSIVE TEST QUERIES
# =============================================================================
# Each query tests a specific edge case in the graph retrieval + reasoning pipeline
#
# Categories:
#   1. DELETED CONTENT     - Tests if removed text is properly captured
#   2. MULTI-HOP           - Tests cross-article relationship traversal
#   3. TEMPORAL EVOLUTION   - Tests tracking changes across multiple amendments
#   4. CROSS-ARTICLE        - Tests relationships between different articles
#   5. CURRENT STATUS       - Tests understanding of present-day law
#   6. COMPARISON           - Tests comparing two related provisions
#   7. SCHEDULE CHANGES     - Tests non-article constitutional elements
#   8. LANDMARK / STRUCK DOWN - Tests amendments that were invalidated
#   9. INSERTED CONTENT     - Tests new articles/parts added
#  10. MODERN REFORMS       - Tests recent structural changes (e.g. GST)
#  11. EMERGENCY PROVISIONS - Tests complex multi-article emergency interactions
# =============================================================================

def _query_case(case_id, category, *, query, expected_keywords, why):
    """Build one evaluation case in the dict shape the test runners read.

    Keys are emitted in the order: id, category, query, expected_keywords, why.
    """
    return {
        "id": case_id,
        "category": category,
        "query": query,
        "expected_keywords": expected_keywords,
        "why": why,
    }


# Evaluation table: one entry per edge case, grouped by category. The runners
# look each expected keyword up in the generated answer (case-insensitively).
QUERIES = [
    # ---- 1. DELETED CONTENT (the original failing case) ---------------------
    _query_case(
        "Q1",
        "DELETED_CONTENT",
        query="What changes did the 44th Amendment make to Article 19?",
        expected_keywords=["property", "armed rebellion", "internal disturbance", "19(1)(f)"],
        why="Tests if deleted text ('internal disturbance') is captured in graph",
    ),

    # ---- 2. MULTI-HOP RELATIONSHIPS -----------------------------------------
    _query_case(
        "Q2",
        "MULTI_HOP",
        query="How does Article 358 interact with Article 19 during a national emergency?",
        expected_keywords=["suspend", "emergency", "fundamental rights", "358", "19"],
        why="Tests multi-hop: Amendment → Article 358 → SUSPENDS → Article 19",
    ),
    _query_case(
        "Q3",
        "MULTI_HOP",
        query="What is the relationship between Article 368 and Article 13 after the 24th Amendment?",
        expected_keywords=["368", "13", "amendment", "fundamental rights", "constitutional amendment"],
        why="Tests multi-hop: 24th Amendment modified Art 13 to exempt Art 368",
    ),

    # ---- 3. TEMPORAL EVOLUTION (same article changed by several amendments) -
    _query_case(
        "Q4",
        "TEMPORAL_EVOLUTION",
        query="How has Article 15 been modified across different amendments?",
        expected_keywords=["reservation", "backward classes", "15(4)", "103", "economically weaker"],
        why="Art 15 modified by Amendments 1, 93, 103 — tests tracking evolution",
    ),
    _query_case(
        "Q5",
        "TEMPORAL_EVOLUTION",
        query="Trace the history of Right to Property from a fundamental right to a legal right",
        expected_keywords=["31", "44", "300A", "fundamental", "legal right", "property"],
        why="Art 31 (fundamental) → 25th/42nd weakened → 44th repealed → Art 300A (legal)",
    ),

    # ---- 4. CROSS-ARTICLE EFFECTS -------------------------------------------
    _query_case(
        "Q6",
        "CROSS_ARTICLE",
        query="What was the impact of the 42nd Amendment on both fundamental rights and directive principles?",
        expected_keywords=["42", "fundamental", "directive", "31C", "part IV"],
        why="42nd gave directive principles primacy over fundamental rights via Art 31C",
    ),
    _query_case(
        "Q7",
        "CROSS_ARTICLE",
        query="How did the 7th Amendment reorganize states and affect territorial provisions?",
        expected_keywords=["Article 1", "reorganized states", "Union Territories", "schedule"],
        why="7th Am modified Art 1, 3, and First Schedule — tests broad cross-article impact",
    ),

    # ---- 5. CURRENT STATUS QUERIES ------------------------------------------
    _query_case(
        "Q8",
        "CURRENT_STATUS",
        query="What is the current constitutional provision for the right to education?",
        expected_keywords=["21A", "86", "free", "compulsory", "6 to 14"],
        why="86th Am inserted Art 21A — tests INSERTED content and current law",
    ),
    _query_case(
        "Q9",
        "CURRENT_STATUS",
        query="What is the current status of reservation for economically weaker sections?",
        expected_keywords=["103", "EWS", "10%", "15(6)", "16(6)"],
        why="103rd Am added EWS reservation — tests newest amendments",
    ),

    # ---- 6. COMPARISON QUERIES ----------------------------------------------
    _query_case(
        "Q10",
        "COMPARISON",
        query="Compare the changes made by the 42nd and 44th Amendments",
        expected_keywords=["42", "44", "emergency", "fundamental rights", "directive principles"],
        why="42nd expanded govt power, 44th reversed it — tests comparative reasoning",
    ),

    # ---- 7. SCHEDULE & STRUCTURAL CHANGES -----------------------------------
    _query_case(
        "Q11",
        "SCHEDULE",
        query="What is the Tenth Schedule and how was it introduced?",
        expected_keywords=["52", "anti-defection", "tenth schedule", "disqualification"],
        why="52nd Am inserted 10th Schedule — tests Schedule nodes in graph",
    ),

    # ---- 8. STRUCK DOWN / LANDMARK AMENDMENTS -------------------------------
    _query_case(
        "Q12",
        "STRUCK_DOWN",
        query="What was the NJAC and why was the 99th Amendment struck down?",
        expected_keywords=["99", "NJAC", "judicial appointments", "unconstitutional", "collegium"],
        why="99th Am struck down by SC — tests handling of invalidated amendments",
    ),

    # ---- 9. INSERTED CONTENT (new articles/parts) ---------------------------
    _query_case(
        "Q13",
        "INSERTED_CONTENT",
        query="What new constitutional framework was created for Panchayati Raj institutions?",
        expected_keywords=["73", "Part IX", "243", "panchayat", "reservation"],
        why="73rd Am inserted entire Part IX (Art 243-243O) — tests bulk insertion",
    ),

    # ---- 10. GST / MODERN REFORMS -------------------------------------------
    _query_case(
        "Q14",
        "MODERN_REFORM",
        query="How did the 101st Amendment establish the GST framework?",
        expected_keywords=["101", "GST", "246A", "goods and services tax", "concurrent"],
        why="101st Am added Art 246A, 269A, modified seventh schedule — tests modern changes",
    ),

    # ---- 11. EMERGENCY PROVISIONS (complex multi-article interaction) -------
    _query_case(
        "Q15",
        "EMERGENCY",
        query="How have emergency provisions under Article 352 been modified since the original Constitution?",
        expected_keywords=["352", "emergency", "armed rebellion", "internal disturbance", "44"],
        why="Art 352 modified multiple times — tests temporal + deleted content together",
    ),
]

# =============================================================================
# TEST RUNNER
# =============================================================================

def _score_keywords(answer_text, expected_keywords):
    """Partition expected keywords by whether they occur in an answer.

    Matching is a case-insensitive substring test.

    Args:
        answer_text: The answer text to search.
        expected_keywords: Keywords that should appear in the answer.

    Returns:
        A ``(found, missing, score)`` tuple; ``score`` is the percentage
        (0-100) of keywords found. An empty keyword list scores 0.0 instead
        of raising ``ZeroDivisionError``.
    """
    haystack = answer_text.lower()
    found, missing = [], []
    for keyword in expected_keywords:
        (found if keyword.lower() in haystack else missing).append(keyword)
    score = len(found) / len(expected_keywords) * 100 if expected_keywords else 0.0
    return found, missing, score


@pytest.mark.asyncio
async def test_quality_answers():
    """Run every entry in QUERIES through the graph workflow and report on it.

    Each query is sent to ``graph_app.ainvoke``; the returned answer is scored
    by the share of ``expected_keywords`` it contains and passes at 60% or
    more. Per-query results and a per-category summary are printed, and the
    full results are written to ``test_report.json`` next to this file.

    Raises:
        AssertionError: If any query scores below the pass threshold. The
            check runs last, so the console summary and the JSON report are
            produced before the test is failed.
    """
    pass_threshold = 60  # minimum percentage of expected keywords required to pass
    rule = "=" * 70
    results = []

    for q in QUERIES:
        print(f"\n{rule}")
        print(f"🧪 [{q['id']}] ({q['category']}) Testing: '{q['query']}'")
        print(f"   Why: {q['why']}")
        print(rule)

        start_time = datetime.now()
        result = await graph_app.ainvoke({"query": q['query'], "retry_count": 0})
        elapsed = (datetime.now() - start_time).total_seconds()

        # `or` fallbacks cover keys that are present but hold None, which a
        # plain .get(key, default) would pass through and then crash on.
        draft = result.get("draft_answer") or {}
        answer_text = (draft.get("answer") or "").lower()
        trace = result.get("trace") or []
        critique = result.get("critique") or {}
        confidence = critique.get("final_confidence", 0)

        # This trace entry is how the run reports a forced re-retrieval.
        had_retry = "Validate (graph incomplete → forced retry)" in trace

        found, missing, keyword_score = _score_keywords(answer_text, q['expected_keywords'])
        test_pass = keyword_score >= pass_threshold
        status = "✅ PASS" if test_pass else "❌ FAIL"

        # Print results
        print(f"\n{status} | Keywords: {keyword_score:.0f}% | Confidence: {confidence} | Time: {elapsed:.1f}s")
        print(f"   Found: {found}")
        if missing:
            print(f"   Missing: {missing}")
        if had_retry:
            print("   ⚠️ Graph was incomplete → forced re-retrieval")
        print(f"   Trace: {' → '.join(trace)}")

        print("\n--- ANSWER PREVIEW (first 500 chars) ---")
        print(answer_text[:500])
        print("--- END PREVIEW ---")

        results.append({
            "id": q["id"],
            "category": q["category"],
            "query": q["query"],
            "status": "PASS" if test_pass else "FAIL",
            "keyword_score": keyword_score,
            "confidence": confidence,
            "missing": missing,
            "time_seconds": elapsed,
            "had_retry": had_retry,
            "trace": trace,
        })

    # ==========================================================================
    # SUMMARY REPORT
    # ==========================================================================
    failures = [r for r in results if r["status"] == "FAIL"]
    failed = len(failures)
    passed = len(results) - failed
    # Guard the rate so an empty QUERIES table reports 0% instead of crashing.
    pass_rate = passed / len(QUERIES) * 100 if QUERIES else 0.0

    print(f"\n\n{rule}")
    print("📊 TEST SUMMARY REPORT")
    print(rule)
    print(f"Total: {len(QUERIES)} | Passed: {passed} | Failed: {failed}")
    print(f"Pass Rate: {pass_rate:.0f}%")
    print()

    # Category breakdown
    categories = {}
    for r in results:
        counts = categories.setdefault(r["category"], {"pass": 0, "fail": 0})
        counts["pass" if r["status"] == "PASS" else "fail"] += 1

    print(f"{'Category':<25} {'Pass':<6} {'Fail':<6} {'Rate':<8}")
    print("-" * 45)
    for cat, counts in sorted(categories.items()):
        total = counts["pass"] + counts["fail"]
        rate = counts["pass"] / total * 100
        print(f"{cat:<25} {counts['pass']:<6} {counts['fail']:<6} {rate:.0f}%")

    # Failed tests detail
    if failures:
        print("\n❌ FAILED TESTS:")
        for failure in failures:
            print(f"  [{failure['id']}] {failure['query']}")
            print(f"       Missing: {failure['missing']}")
            print(f"       Confidence: {failure['confidence']}, Had retry: {failure['had_retry']}")

    # Save report to file
    report_path = os.path.join(os.path.dirname(__file__), "test_report.json")
    report = {
        "summary": {"total": len(QUERIES), "passed": passed, "failed": failed},
        "results": results,
    }
    with open(report_path, 'w', encoding="utf-8") as report_file:
        json.dump(report, report_file, indent=2)
    print(f"\n📄 Full report saved to: {report_path}")

    # Without an assertion this test could never fail; check last so the
    # report above is always available for diagnosing the failure.
    failed_ids = [r["id"] for r in failures]
    assert not failed_ids, (
        f"{failed} of {len(QUERIES)} queries scored below "
        f"{pass_threshold}% keyword coverage: {failed_ids}"
    )

# =============================================================================
# QUICK TEST - Run a single query for debugging
# =============================================================================
async def test_single(query_id: str):
    """Run a single query from QUERIES by ID (e.g., 'Q1') for debugging.

    Prints the full answer returned by ``graph_app.ainvoke`` followed by any
    expected keywords missing from it (case-insensitive substring match).

    Args:
        query_id: The ``"id"`` of the QUERIES entry to run.

    Returns:
        None. If ``query_id`` is unknown, the available IDs are printed and
        the workflow is not invoked.
    """
    q = next((case for case in QUERIES if case["id"] == query_id), None)
    if not q:
        print(f"Query {query_id} not found. Available: {[case['id'] for case in QUERIES]}")
        return

    print(f"🧪 Testing: {q['query']}")
    result = await graph_app.ainvoke({"query": q['query'], "retry_count": 0})

    # `or` fallbacks cover keys that are present but hold None.
    draft = result.get("draft_answer") or {}
    answer = draft.get("answer") or ""
    print(f"\n=== ANSWER ===\n{answer}\n{'='*60}")

    # Lowercase the answer once rather than once per keyword.
    answer_lower = answer.lower()
    missing = [k for k in q['expected_keywords'] if k.lower() not in answer_lower]
    if missing:
        print(f"❌ Missing: {missing}")
    else:
        print("✅ All keywords found!")


# The ``test_`` prefix matches pytest's default function-name pattern, but
# ``query_id`` is a plain argument, not a fixture, so collecting this helper
# would error with "fixture 'query_id' not found". Opt it out of collection
# while keeping the public name used by the CLI entry point.
test_single.__test__ = False

if __name__ == "__main__":
    import argparse

    # CLI: with -q/--query run just that query ID, otherwise run the full suite.
    cli = argparse.ArgumentParser()
    cli.add_argument("--query", "-q", default=None, help="Run single query by ID (e.g. Q1)")
    selected_id = cli.parse_args().query

    entry_point = test_single(selected_id) if selected_id else test_quality_answers()
    asyncio.run(entry_point)