"""
RAG Pipeline Evaluation (v2)
------------------------------
Comprehensive tests covering:
- Query classification accuracy
- Retrieval precision and recall
- Guardrail validation (on-topic / off-topic detection)
- End-to-end response quality (requires GROQ_API_KEY)

Usage:
    python evaluate.py                    # classification + retrieval + guardrail tests
    GROQ_API_KEY=xxx python evaluate.py   # all tests including end-to-end
"""

import os
import sys
import json
from pathlib import Path
from datetime import datetime

sys.path.insert(0, str(Path(__file__).parent))

from src.ingest import load_knowledge_base, build_documents, get_embeddings, build_vector_store, load_vector_store
from src.retriever import HybridRetriever, classify_query
from src.chain import check_guardrails


# ---------------------------------------------------------------------------
# TEST CASES
# ---------------------------------------------------------------------------

# Retrieval cases consumed by run_retrieval_tests(). Each entry is a dict with:
#   "query"         - text passed to retriever.retrieve()
#   "description"   - label printed in the report
#   "expected_ids"  - (optional) ids compared against each retrieved document's
#                     "doc_id" metadata; a case needs at least half of them
#                     among the results to pass
#   "expected_type" - (optional) expected document type, used by the case whose
#                     answer set has no fixed id list
RETRIEVAL_TESTS = [
    {
        "query": "What opportunities are available for investigative journalists in Africa?",
        "expected_ids": ["opp-002", "opp-017"],
        "description": "Region + topic filter — Africa investigative",
    },
    {
        "query": "Find fellowships with deadlines in the next 30 days",
        "expected_type": "fellowship",
        "description": "Deadline + type filter — fellowships",
    },
    {
        "query": "What resources does IJNet have on AI tools for journalists?",
        "expected_ids": ["art-001", "opp-007", "opp-020"],
        "description": "Topic search — AI tools",
    },
    {
        "query": "Can you summarize the latest opportunities for product/design people in newsrooms?",
        "expected_ids": ["art-003", "opp-016", "opp-015"],
        "description": "Product/design role search",
    },
    {
        "query": "Which IJNet newsletter should I subscribe to?",
        "expected_ids": ["art-002"],
        "description": "Newsletter-specific query",
    },
    {
        "query": "What grants are available for data journalism?",
        "expected_ids": ["opp-005", "opp-013"],
        "description": "Grant type + data journalism topic",
    },
    {
        "query": "Tell me about digital security for journalists",
        "expected_ids": ["art-004"],
        "description": "Article retrieval — digital security",
    },
    {
        "query": "What training programs exist for journalists in the Middle East?",
        "expected_ids": ["opp-007", "opp-012"],
        "description": "Region filter — MENA",
    },
    {
        "query": "Climate change reporting opportunities",
        "expected_ids": ["opp-004", "opp-008"],
        "description": "Topic — environment/climate",
    },
    {
        "query": "What is IJNet?",
        "expected_ids": ["ijnet-about"],
        "description": "About IJNet query",
    },
    {
        "query": "Opportunities for women journalists in Africa",
        "expected_ids": ["opp-011"],
        "description": "Women + Africa filter",
    },
    {
        "query": "How can freelance journalists find funding?",
        "expected_ids": ["art-006"],
        "description": "Freelance funding article",
    },
    {
        "query": "fact-checking training workshops",
        "expected_ids": ["opp-017"],
        "description": "Fact-checking topic",
    },
    {
        "query": "press freedom fellowships",
        "expected_ids": ["opp-019"],
        "description": "Press freedom topic",
    },
    {
        "query": "mobile journalism webinar",
        "expected_ids": ["opp-012"],
        "description": "MoJo / mobile journalism",
    },
]

# Classification cases consumed by run_classification_tests().
# Each entry is (query, expected_intent, expected_filters). Only the keys
# listed in expected_filters are compared against the classifier's filters,
# so an empty dict means "check the intent only".
CLASSIFICATION_TESTS = [
    ("Find fellowships with deadlines in the next 30 days", "deadline_search", {"deadline_days": 30}),
    ("Opportunities for journalists in Africa", "region_search", {}),
    ("Which newsletter should I subscribe to?", "newsletter", {}),
    ("What is IJNet?", "about", {}),
    ("AI tools for newsrooms", "general", {}),
    ("Grants expiring within 60 days", "deadline_search", {"deadline_days": 60}),
    ("Training programs in the Middle East", "region_search", {}),
    ("Fellowships closing in the next 2 weeks", "deadline_search", {"deadline_days": 14}),
    ("Data journalism awards", "general", {}),
    ("What opportunities are there in South Asia?", "region_search", {}),
]

# Guardrail cases consumed by run_guardrail_tests(). The boolean is the
# expected first element of check_guardrails(query): True when the query
# should be let through, False when it should be blocked.
GUARDRAIL_TESTS = [
    # (query, should_be_allowed)
    # On-topic queries, greetings and short help requests
    ("What fellowships are available for African journalists?", True),
    ("Tell me about AI tools for newsrooms", True),
    ("Which IJNet newsletter should I subscribe to?", True),
    ("Hello", True),
    ("Thanks for the help!", True),
    ("What grants exist?", True),
    ("help", True),
    # Off-topic queries
    ("Write me a poem about the moon", False),
    ("What's the weather in New York?", False),
    ("How do I cook pasta carbonara?", False),
    ("Solve this math equation: 2x + 5 = 15", False),
    ("Translate this to French: hello world", False),
    ("Tell me a joke", False),
    # Edge cases — should still be allowed (journalism-adjacent)
    ("How can journalists use AI?", True),
    ("media training opportunities", True),
    ("press freedom in Asia", True),
]

# End-to-end cases consumed by run_e2e_tests(). Each entry is a dict with:
#   "query"            - text sent through the full chain
#   "must_contain"     - substrings that must all appear in the answer
#   "must_not_contain" - substrings that must all be absent from the answer
#   "description"      - label printed in the report
# Both term lists are matched case-insensitively as plain substrings, so a
# stem such as "investigat" covers "investigative" and "investigation".
E2E_TESTS = [
    {
        "query": "What opportunities are available for investigative journalists in Africa?",
        "must_contain": ["Africa", "investigat"],
        "must_not_contain": ["I don't have information"],
        "description": "Should find African investigative opportunities",
    },
    {
        "query": "Which IJNet newsletter should I subscribe to?",
        "must_contain": ["newsletter", "subscribe"],
        "must_not_contain": ["I don't have information"],
        "description": "Should describe newsletter options",
    },
    {
        "query": "Write me a poem about the ocean",
        "must_contain": ["journalism", "IJNet"],
        "must_not_contain": ["ocean", "poem", "sea"],
        "description": "Should reject off-topic and redirect",
    },
    {
        "query": "What AI tools can journalists use?",
        "must_contain": ["AI", "tool"],
        "must_not_contain": ["I don't have information"],
        "description": "Should discuss AI tools from the article",
    },
    {
        "query": "Are there any grants for data journalism?",
        "must_contain": ["data journalism", "grant"],
        "must_not_contain": [],
        "description": "Should find data journalism grants",
    },
]


# ---------------------------------------------------------------------------
# TEST RUNNERS
# ---------------------------------------------------------------------------

def run_classification_tests():
    """Run the query-classification cases and print a per-case report.

    Each entry of ``CLASSIFICATION_TESTS`` is ``(query, expected_intent,
    expected_filters)``. A case passes when ``classify_query(query)`` returns
    the expected intent and every key listed in ``expected_filters`` has the
    expected value in the returned filters; unlisted filter keys are ignored.

    Returns:
        A ``(passed, total)`` tuple of case counts.
    """
    print("\n" + "=" * 60)
    print("1. QUERY CLASSIFICATION TESTS")
    print("=" * 60)

    passed = 0
    for query, expected_intent, expected_filters in CLASSIFICATION_TESTS:
        result = classify_query(query)
        actual_intent = result["intent"]
        intent_match = actual_intent == expected_intent

        # Collect every filter that differs so the report can name it,
        # instead of only recording that "some filter" was wrong.
        filter_mismatches = {
            key: (wanted, result["filters"].get(key))
            for key, wanted in expected_filters.items()
            if result["filters"].get(key) != wanted
        }
        case_ok = intent_match and not filter_mismatches

        status = "✅" if case_ok else "❌"
        label = f"{query[:55]}..." if len(query) > 55 else query
        print(f"  {status} \"{label}\"")

        if case_ok:
            passed += 1
            continue

        # Report only what actually differed: printing the intent line for a
        # filter-only failure would show identical expected/actual values.
        if not intent_match:
            print(f"      Expected: {expected_intent}, Got: {actual_intent}")
        for key, (wanted, got) in filter_mismatches.items():
            print(f"      Filter {key!r}: expected {wanted!r}, got {got!r}")

    total = len(CLASSIFICATION_TESTS)
    # Guard the percentage so an empty suite reports 0% instead of crashing.
    pass_rate = passed / total if total else 0.0
    print(f"\n  Result: {passed}/{total} passed ({pass_rate:.0%})")
    return passed, total


def run_guardrail_tests():
    """Run the guardrail cases and print one report line per query.

    Every ``(query, should_allow)`` pair in ``GUARDRAIL_TESTS`` is sent to
    ``check_guardrails``; a case passes when the returned allow flag equals
    the expected one. The message returned alongside the flag is ignored.

    Returns:
        A ``(passed, total)`` tuple of case counts.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("2. GUARDRAIL TESTS")
    print(banner)

    total = len(GUARDRAIL_TESTS)
    passed = 0

    for prompt, want_allowed in GUARDRAIL_TESTS:
        got_allowed, _message = check_guardrails(prompt)
        verdict_ok = got_allowed == want_allowed

        if verdict_ok:
            mark = "✅"
        else:
            mark = "❌"
        wanted_label = "allow" if want_allowed else "block"
        got_label = "allowed" if got_allowed else "blocked"

        # Only the first 50 characters of the query are shown in the report.
        print(f"  {mark} [{wanted_label}] \"{prompt[:50]}\" → {got_label}")

        if verdict_ok:
            passed += 1

    print(f"\n  Result: {passed}/{total} passed ({passed/total:.0%})")
    return passed, total


def _doc_has_type(doc, wanted_type):
    """Return True when any string metadata value of *doc* equals *wanted_type* (case-insensitive)."""
    wanted = wanted_type.lower()
    return any(
        isinstance(value, str) and value.lower() == wanted
        for value in doc.metadata.values()
    )


def run_retrieval_tests(retriever: HybridRetriever):
    """Run the retrieval cases against *retriever* and print a report.

    For every entry of ``RETRIEVAL_TESTS`` the query is passed to
    ``retriever.retrieve`` and the results are checked against whichever
    expectations the entry declares:

    * ``expected_ids`` - at least half of the ids must appear among the
      retrieved documents' ``doc_id`` metadata.
    * ``expected_type`` - at least one retrieved document must carry that
      type in its metadata.

    An entry that declares neither expectation passes unconditionally.

    Args:
        retriever: Object exposing ``retrieve(query)`` that returns documents
            with a ``metadata`` mapping.

    Returns:
        A ``(passed, total)`` tuple of case counts.
    """
    print("\n" + "=" * 60)
    print("3. RETRIEVAL TESTS")
    print("=" * 60)

    passed = 0
    total = len(RETRIEVAL_TESTS)

    for test in RETRIEVAL_TESTS:
        expected_ids = test.get("expected_ids", [])
        expected_type = test.get("expected_type")

        results = retriever.retrieve(test["query"])
        retrieved_ids = [doc.metadata.get("doc_id", "") for doc in results]

        failures = []

        if expected_ids:
            found = [eid for eid in expected_ids if eid in retrieved_ids]
            # Lenient threshold: half of the expected documents is enough.
            if len(found) / len(expected_ids) < 0.5:
                failures.append(f"Expected: {expected_ids}, Got: {retrieved_ids}")

        if expected_type:
            # Previously this expectation was read but never checked, so
            # type-only cases could not fail.
            # NOTE(review): the metadata key that stores the document type is
            # not visible from this file, so any string metadata value equal
            # to the expected type counts as a match — confirm against
            # build_documents() and narrow this to the real key.
            if not any(_doc_has_type(doc, expected_type) for doc in results):
                failures.append(
                    f"Expected type: {expected_type!r}, Got: {retrieved_ids}"
                )

        status = "❌" if failures else "✅"
        print(f"  {status} {test['description']}")
        for failure in failures:
            print(f"      {failure}")

        if not failures:
            passed += 1

    # Guard the percentage so an empty suite reports 0% instead of crashing.
    pass_rate = passed / total if total else 0.0
    print(f"\n  Result: {passed}/{total} passed ({pass_rate:.0%})")
    return passed, total


def run_e2e_tests(retriever: HybridRetriever):
    """Run the end-to-end cases through the full chain and print a report.

    The suite is skipped, returning ``(0, 0)``, unless the ``GROQ_API_KEY``
    environment variable is set. Otherwise each ``E2E_TESTS`` entry is sent
    through ``IJNetRAGChain.query`` and the lower-cased answer is checked for
    the required and forbidden substrings. Any exception raised while
    handling a case is reported and counts as a failure.

    Args:
        retriever: Retriever handed to ``IJNetRAGChain``.

    Returns:
        A ``(passed, total)`` tuple of case counts.
    """
    banner = "=" * 60
    api_key = os.environ.get("GROQ_API_KEY")

    if not api_key:
        print("\n" + banner)
        print("4. END-TO-END TESTS (SKIPPED — set GROQ_API_KEY to run)")
        print(banner)
        return 0, 0

    print("\n" + banner)
    print("4. END-TO-END TESTS")
    print(banner)

    # Imported here so the chain class is only loaded when the suite runs.
    from src.chain import IJNetRAGChain
    chain = IJNetRAGChain(retriever=retriever, groq_api_key=api_key)

    passed = 0
    for case in E2E_TESTS:
        question = case["query"]

        try:
            response = chain.query(question)
            answer = response["answer"].lower()

            # Required terms that are absent / forbidden terms that are present.
            missing = [t for t in case["must_contain"] if t.lower() not in answer]
            found_bad = [t for t in case["must_not_contain"] if t.lower() in answer]

            case_ok = not missing and not found_bad
            print(f"  {'✅' if case_ok else '❌'} {case['description']}")

            if case_ok:
                passed += 1
            else:
                if missing:
                    print(f"      Missing terms: {missing}")
                if found_bad:
                    print(f"      Unwanted terms found: {found_bad}")
                print(f"      Response preview: {answer[:150]}...")

        except Exception as e:
            print(f"  ❌ {case['description']} — Error: {e}")

    total = len(E2E_TESTS)
    print(f"\n  Result: {passed}/{total} passed ({passed/total:.0%})")
    return passed, total


# ---------------------------------------------------------------------------
# MAIN
# ---------------------------------------------------------------------------

def main():
    """Build the retrieval pipeline, run every test suite and print a summary.

    The knowledge base and the FAISS index are looked up under the ``data``
    directory next to this script, so the evaluation can be started from any
    working directory. An existing index is loaded; otherwise one is built
    from the knowledge-base documents. Suites that report zero cases (the
    end-to-end suite when it is skipped) are left out of the summary.
    """
    print("=" * 60)
    print("IJNet RAG Pipeline — Evaluation Suite (v2)")
    print("=" * 60)

    # Anchor data paths to the script directory, matching the sys.path entry
    # added at import time, instead of relying on the current working dir.
    data_dir = Path(__file__).parent / "data"
    kb_path = data_dir / "knowledge_base.json"
    index_path = data_dir / "faiss_index"

    # Initialize
    print("\nInitializing pipeline...")
    kb = load_knowledge_base(str(kb_path))
    documents = build_documents(kb)
    embeddings = get_embeddings()

    if index_path.exists():
        vector_store = load_vector_store(str(index_path), embeddings)
    else:
        vector_store = build_vector_store(documents, embeddings, str(index_path))

    retriever = HybridRetriever(
        vector_store=vector_store,
        documents=documents,
        semantic_k=8,
        bm25_k=8,
        final_k=5,
    )
    print(f"Pipeline ready. {len(documents)} documents indexed.")

    # Run all test suites; each runner returns (passed, total).
    results = [
        ("Classification", *run_classification_tests()),
        ("Guardrails", *run_guardrail_tests()),
        ("Retrieval", *run_retrieval_tests(retriever)),
        ("End-to-End", *run_e2e_tests(retriever)),
    ]

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    total_passed = 0
    total_tests = 0
    for name, passed, total in results:
        # Skipped or empty suites report total == 0 and are not listed.
        if total > 0:
            print(f"  {name:20s}: {passed}/{total} ({passed/total:.0%})")
            total_passed += passed
            total_tests += total

    if total_tests > 0:
        print(f"  {'OVERALL':20s}: {total_passed}/{total_tests} ({total_passed/total_tests:.0%})")
    print("=" * 60)


if __name__ == "__main__":
    main()