Spaces:

ChiragPatankar
/

RAG_backend

Sleeping

App Files Files Community

ChiragPatankar committed on Jan 22

Commit

2ed3c40

verified ·

1 Parent(s): 16001d7

Upload scripts/validate_rag.py with huggingface_hub

Browse files

Files changed (1) hide show

scripts/validate_rag.py +466 -0

scripts/validate_rag.py ADDED Viewed

	@@ -0,0 +1,466 @@

+"""
+Automated RAG pipeline validation script.
+Tests end-to-end functionality, multi-tenant isolation, and anti-hallucination.
+"""
+import httpx
+import time
+import json
+from pathlib import Path
+from typing import Dict, List, Any, Tuple
+import sys
+# Add parent directory to path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+BASE_URL = "http://localhost:8000"  # every request in this script is sent to this server
+TEST_TENANT_A = "tenant_A"  # tenant identifiers for the two test tenants
+TEST_TENANT_B = "tenant_B"
+TEST_USER_A = "user_A"  # user identifiers, one per tenant
+TEST_USER_B = "user_B"
+TEST_KB_A = "kb_A"  # knowledge-base identifiers, one per tenant
+TEST_KB_B = "kb_B"
+# Test documents: one markdown file per tenant, located in data/test_docs under the parent of this script's directory
+TENANT_A_DOC = Path(__file__).parent.parent / "data" / "test_docs" / "tenant_A_kb.md"
+TENANT_B_DOC = Path(__file__).parent.parent / "data" / "test_docs" / "tenant_B_kb.md"
+# Test results storage: print_test() appends one dict per test and main() reads the list for the summary
+test_results: List[Dict[str, Any]] = []
+def print_header(text: str):
+    """Print a formatted header."""
+    print("\n" + "=" * 80)
+    print(f"  {text}")
+    print("=" * 80)
+def print_test(test_name: str, passed: bool, reason: str = ""):
+    """Print test result."""
+    status = "[PASS]" if passed else "[FAIL]"
+    print(f"{status} | {test_name}")
+    if reason:
+        print(f"      └─ {reason}")
+    test_results.append({
+        "test": test_name,
+        "passed": passed,
+        "reason": reason
+    })
+def wait_for_server(max_retries: int = 10, delay: int = 2) -> bool:
+    """Wait for the server to be ready."""
+    print("Waiting for server to be ready...")
+    for i in range(max_retries):
+        try:
+            response = httpx.get(f"{BASE_URL}/health", timeout=5)
+            if response.status_code == 200:
+                print("[OK] Server is ready")
+                return True
+        except Exception:
+            pass
+        time.sleep(delay)
+        print(f"  Retry {i+1}/{max_retries}...")
+    print("[FAIL] Server not ready after max retries")
+    return False
+def upload_document(
+    client: httpx.Client,
+    file_path: Path,
+    tenant_id: str,
+    user_id: str,
+    kb_id: str
+) -> Dict[str, Any]:
+    """Upload a document to the knowledge base."""
+    try:
+        with open(file_path, "rb") as f:
+            files = {"file": (file_path.name, f, "text/markdown")}
+            data = {
+                "tenant_id": tenant_id,
+                "user_id": user_id,
+                "kb_id": kb_id
+            }
+            response = client.post(
+                f"{BASE_URL}/kb/upload",
+                files=files,
+                data=data,
+                timeout=60
+            )
+            if response.status_code == 200:
+                return {"success": True, "data": response.json()}
+            else:
+                return {"success": False, "error": response.text}
+    except Exception as e:
+        return {"success": False, "error": str(e)}
+def test_retrieval(
+    client: httpx.Client,
+    query: str,
+    tenant_id: str,
+    user_id: str,
+    kb_id: str,
+    expected_keywords: List[str],
+    should_not_contain: List[str] = None,
+    top_k: int = 5
+) -> Tuple[bool, str]:
+    """Test retrieval accuracy."""
+    try:
+        # Use GET for search endpoint with headers for dev mode auth
+        headers = {
+            "X-Tenant-Id": tenant_id,
+            "X-User-Id": user_id
+        }
+        response = client.get(
+            f"{BASE_URL}/kb/search",
+            params={
+                "query": query,
+                "kb_id": kb_id,
+                "top_k": top_k
+            },
+            headers=headers,
+            timeout=30
+        )
+        if response.status_code != 200:
+            return False, f"API returned {response.status_code}: {response.text}"
+        data = response.json()
+        results = data.get("results", [])
+        if not results:
+            return False, "No results retrieved"
+        # Check tenant isolation
+        for result in results:
+            metadata = result.get("metadata", {})
+            result_tenant = metadata.get("tenant_id")
+            if result_tenant != tenant_id:
+                return False, f"Tenant leak detected! Got tenant_id={result_tenant}, expected {tenant_id}"
+        # Check for expected keywords
+        all_content = " ".join([r.get("content", "") for r in results]).lower()
+        found_keywords = [kw for kw in expected_keywords if kw.lower() in all_content]
+        if not found_keywords:
+            return False, f"Expected keywords not found: {expected_keywords}"
+        # Check for forbidden content
+        if should_not_contain:
+            for forbidden in should_not_contain:
+                if forbidden.lower() in all_content:
+                    return False, f"Forbidden content found: {forbidden}"
+        return True, f"Retrieved {len(results)} results, found keywords: {found_keywords}"
+    except Exception as e:
+        return False, f"Error: {str(e)}"
+def test_chat(
+    client: httpx.Client,
+    question: str,
+    tenant_id: str,
+    user_id: str,
+    kb_id: str,
+    expected_keywords: List[str] = None,
+    should_refuse: bool = False,
+    should_not_contain: List[str] = None
+) -> Tuple[bool, str, Dict[str, Any]]:
+    """Test full chat endpoint."""
+    try:
+        # Include headers for dev mode auth
+        headers = {
+            "X-Tenant-Id": tenant_id,
+            "X-User-Id": user_id
+        }
+        response = client.post(
+            f"{BASE_URL}/chat",
+            json={
+                "tenant_id": tenant_id,
+                "user_id": user_id,
+                "kb_id": kb_id,
+                "question": question
+            },
+            headers=headers,
+            timeout=60
+        )
+        if response.status_code != 200:
+            return False, f"API returned {response.status_code}: {response.text}", {}
+        data = response.json()
+        answer = data.get("answer", "").lower()
+        citations = data.get("citations", [])
+        from_kb = data.get("from_knowledge_base", False)
+        confidence = data.get("confidence", 0.0)
+        metadata = data.get("metadata", {})
+        refused = metadata.get("refused", False)
+        # Check refusal behavior (STRICT)
+        if should_refuse:
+            # Check if response explicitly indicates refusal
+            refused = data.get("refused", False)
+            refusal_keywords = [
+                "couldn't find", "don't have", "not available", "contact support",
+                "not in the knowledge base", "could not verify", "not enough information",
+                "apologize", "couldn't find relevant information"
+            ]
+            has_refusal_keywords = any(kw in answer for kw in refusal_keywords)
+            # If answer was generated with citations, it's a FAIL (should have refused)
+            if citations and len(citations) > 0:
+                return False, (
+                    f"Should have refused but generated answer with {len(citations)} citations. "
+                    f"Answer: {answer[:300]}"
+                ), data
+            # If confidence is high and answer exists, it's a FAIL
+            if confidence >= 0.30 and answer and not has_refusal_keywords:
+                return False, (
+                    f"Should have refused but generated answer with confidence {confidence:.2f}. "
+                    f"Answer: {answer[:300]}"
+                ), data
+            # If not refused and no refusal keywords, it's a FAIL
+            if not refused and not has_refusal_keywords:
+                return False, (
+                    f"Should have refused but didn't. "
+                    f"refused={refused}, confidence={confidence:.2f}, citations={len(citations)}. "
+                    f"Answer: {answer[:300]}"
+                ), data
+            # If we got here, it properly refused
+            return True, f"Properly refused (refused={refused}, confidence={confidence:.2f})", data
+        # Check for expected keywords
+        if expected_keywords:
+            found = [kw for kw in expected_keywords if kw.lower() in answer]
+            if not found:
+                return False, f"Expected keywords not found: {expected_keywords}. Answer: {answer[:200]}", data
+        # Check citations
+        if not should_refuse and from_kb:
+            if not citations:
+                return False, "Answer claims to be from KB but has no citations", data
+        # Check for forbidden content
+        if should_not_contain:
+            for forbidden in should_not_contain:
+                if forbidden.lower() in answer:
+                    return False, f"Forbidden content found in answer: {forbidden}", data
+        # Check citation integrity
+        if citations and expected_keywords:
+            citation_text = " ".join([c.get("excerpt", "") for c in citations]).lower()
+            for kw in expected_keywords:
+                if kw.lower() in answer and kw.lower() not in citation_text:
+                    # This is a warning, not a failure
+                    pass
+        return True, f"Answer generated (confidence: {confidence:.2f}, citations: {len(citations)})", data
+    except Exception as e:
+        return False, f"Error: {str(e)}", {}
+def main():
+    """Run all validation tests."""
+    print_header("RAG Pipeline Validation Suite")
+    # Check server
+    if not wait_for_server():
+        print("[FAIL] Cannot proceed without server")
+        return
+    client = httpx.Client(timeout=120.0)
+    # ========== PHASE 1: Upload Documents ==========
+    print_header("Phase 1: Upload Test Documents")
+    # Upload tenant A doc
+    print(f"\n📤 Uploading {TENANT_A_DOC.name} for {TEST_TENANT_A}...")
+    result = upload_document(client, TENANT_A_DOC, TEST_TENANT_A, TEST_USER_A, TEST_KB_A)
+    if result["success"]:
+        print("[OK] Upload successful")
+        print("⏳ Waiting for document processing (10 seconds)...")
+        time.sleep(10)  # Wait longer for processing (parsing, chunking, embedding)
+    else:
+        print(f"[FAIL] Upload failed: {result.get('error')}")
+        return
+    # Upload tenant B doc
+    print(f"\n📤 Uploading {TENANT_B_DOC.name} for {TEST_TENANT_B}...")
+    result = upload_document(client, TENANT_B_DOC, TEST_TENANT_B, TEST_USER_B, TEST_KB_B)
+    if result["success"]:
+        print("[OK] Upload successful")
+        print("⏳ Waiting for document processing (10 seconds)...")
+        time.sleep(10)  # Wait longer for processing (parsing, chunking, embedding)
+    else:
+        print(f"[FAIL] Upload failed: {result.get('error')}")
+        return
+    # ========== PHASE 2: Retrieval Tests ==========
+    print_header("Phase 2: Retrieval Accuracy Tests")
+    # Test 1: Tenant A retrieval
+    passed, reason = test_retrieval(
+        client,
+        "What is the refund window?",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        expected_keywords=["7 days"],
+        should_not_contain=["30 days"]
+    )
+    print_test("Retrieval: Tenant A - Refund Window", passed, reason)
+    # Test 2: Tenant B retrieval
+    passed, reason = test_retrieval(
+        client,
+        "What is the refund window?",
+        TEST_TENANT_B,
+        TEST_USER_B,
+        TEST_KB_B,
+        expected_keywords=["30 days"],
+        should_not_contain=["7 days"]
+    )
+    print_test("Retrieval: Tenant B - Refund Window", passed, reason)
+    # Test 3: Tenant isolation (A should not get B's data)
+    passed, reason = test_retrieval(
+        client,
+        "Starter plan price",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        expected_keywords=["499"],
+        should_not_contain=["999"]
+    )
+    print_test("Retrieval: Tenant A - Starter Plan Price (Isolation)", passed, reason)
+    # Test 4: Tenant isolation (B should not get A's data)
+    passed, reason = test_retrieval(
+        client,
+        "Starter plan price",
+        TEST_TENANT_B,
+        TEST_USER_B,
+        TEST_KB_B,
+        expected_keywords=["999"],
+        should_not_contain=["499"]
+    )
+    print_test("Retrieval: Tenant B - Starter Plan Price (Isolation)", passed, reason)
+    # ========== PHASE 3: Chat Tests ==========
+    print_header("Phase 3: Chat Endpoint Tests")
+    # Test 5: Tenant A chat - refund window
+    passed, reason, data = test_chat(
+        client,
+        "What is the refund window?",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        expected_keywords=["7 days"],
+        should_not_contain=["30 days"]
+    )
+    print_test("Chat: Tenant A - Refund Window", passed, reason)
+    # Test 6: Tenant B chat - refund window
+    passed, reason, data = test_chat(
+        client,
+        "What is the refund window?",
+        TEST_TENANT_B,
+        TEST_USER_B,
+        TEST_KB_B,
+        expected_keywords=["30 days"],
+        should_not_contain=["7 days"]
+    )
+    print_test("Chat: Tenant B - Refund Window", passed, reason)
+    # Test 7: Tenant A chat - Starter plan
+    passed, reason, data = test_chat(
+        client,
+        "What is the Starter plan price?",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        expected_keywords=["499"],
+        should_not_contain=["999"]
+    )
+    print_test("Chat: Tenant A - Starter Plan Price", passed, reason)
+    # Test 8: Tenant B chat - Starter plan
+    passed, reason, data = test_chat(
+        client,
+        "What is the Starter plan price?",
+        TEST_TENANT_B,
+        TEST_USER_B,
+        TEST_KB_B,
+        expected_keywords=["999"],
+        should_not_contain=["499"]
+    )
+    print_test("Chat: Tenant B - Starter Plan Price", passed, reason)
+    # Test 9: Hallucination refusal - out of scope
+    passed, reason, data = test_chat(
+        client,
+        "How to integrate ClientSphere with Shopify?",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        should_refuse=True
+    )
+    print_test("Chat: Hallucination Refusal (Out of Scope)", passed, reason)
+    # Test 10: Citation integrity
+    passed, reason, data = test_chat(
+        client,
+        "How long do password reset links last?",
+        TEST_TENANT_A,
+        TEST_USER_A,
+        TEST_KB_A,
+        expected_keywords=["15"]
+    )
+    if passed:
+        citations = data.get("citations", [])
+        if citations:
+            print_test("Chat: Citation Integrity", True, f"Found {len(citations)} citations")
+        else:
+            print_test("Chat: Citation Integrity", False, "No citations provided")
+    else:
+        print_test("Chat: Citation Integrity", False, reason)
+    # ========== PHASE 4: Summary ==========
+    print_header("Test Summary")
+    total_tests = len(test_results)
+    passed_tests = sum(1 for r in test_results if r["passed"])
+    failed_tests = total_tests - passed_tests
+    print(f"\nTotal Tests: {total_tests}")
+    print(f"[PASS] Passed: {passed_tests}")
+    print(f"[FAIL] Failed: {failed_tests}")
+    print(f"Success Rate: {(passed_tests/total_tests*100):.1f}%")
+    if failed_tests > 0:
+        print("\n[FAIL] Failed Tests:")
+        for result in test_results:
+            if not result["passed"]:
+                print(f"  - {result['test']}: {result['reason']}")
+    # Final verdict
+    print_header("Final Verdict")
+    if failed_tests == 0:
+        print("[PASS] ALL TESTS PASSED - RAG Pipeline is working correctly")
+        return 0
+    else:
+        print(f"[FAIL] {failed_tests} TEST(S) FAILED - Review issues above")
+        return 1
+if __name__ == "__main__":
+    exit_code = main()
+    sys.exit(exit_code)