"""
Smoke test: proves the system is dynamic by modifying a source doc,
recompiling, and verifying that skills and agent answers change.

Usage:
    python scripts/smoke_test.py

Requires: backend running on http://localhost:8080
"""

import json
import os
import re
import sys
import time
from typing import Optional

import requests

API = "http://localhost:8080"
COMPANY = "rivanly-inc"

# Path to a source doc we'll modify
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
SOP_PATH = os.path.join(BASE_DIR, "data", "sources", COMPANY, "notion_refund_sop.md")

# Upper bounds (seconds) so a hung backend cannot block the script forever.
SHORT_TIMEOUT = 30  # health, job status, skills and version listings
LONG_TIMEOUT = 300  # compile trigger, agent query, diff

# Polling budget for a compile job: 60 attempts * 5 s = 5 minutes.
POLL_INTERVAL_S = 5
POLL_ATTEMPTS = 60

# Matches "30 day", "30 days", "30-day", "30-days" as whole tokens, so that
# values such as "130 days" are left alone.
_REFUND_WINDOW_RE = re.compile(r"\b30([- ]days?)\b")

# Scenario used before and after the policy change; 45 days sits between the
# original (30) and the modified (60) refund window.
_REFUND_SCENARIO = "Customer requesting a refund after 45 days"
_REFUND_CONTEXT = {"plan": "annual", "days_since_purchase": 45, "tenure_months": 6}


def _require_ok(response, what: str) -> None:
    """Raise RuntimeError unless *response* has HTTP status 200.

    An explicit raise is used instead of ``assert`` because assertions are
    stripped when Python runs with ``-O``.
    """
    if response.status_code != 200:
        raise RuntimeError(f"{what} failed: {response.text}")


def check_health():
    """Call ``GET /health`` and print the reported component states.

    Returns:
        True when the endpoint answered with HTTP 200.

    Raises:
        RuntimeError: if the endpoint answered with a non-200 status.
        requests.RequestException: if the backend cannot be reached.
    """
    print("1. Checking API health...")
    r = requests.get(f"{API}/health", timeout=SHORT_TIMEOUT)
    _require_ok(r, "Health check")
    data = r.json()
    print(f" API: {data['status']}, vLLM: {data['vllm']}, DB: {data['database']}")
    return True


def read_sop():
    """Return the current text of the refund SOP source document."""
    with open(SOP_PATH, "r", encoding="utf-8") as f:
        return f.read()


def write_sop(content: str):
    """Overwrite the refund SOP source document with *content*."""
    with open(SOP_PATH, "w", encoding="utf-8") as f:
        f.write(content)


def _fetch_json_or_none(url: str, label: str):
    """Best-effort GET: return the decoded JSON body, or None on any failure.

    Network errors and undecodable bodies are reported and turned into None so
    that a single flaky poll does not abort the whole wait loop.
    """
    try:
        resp = requests.get(url, timeout=SHORT_TIMEOUT)
        if resp.status_code != 200:
            return None
        return resp.json()
    except (requests.RequestException, ValueError) as exc:
        print(f" [WARN] {label} request failed: {exc}")
        return None


def compile_and_wait():
    """Trigger compilation and poll until complete.

    Returns:
        The JSON payload of ``GET /skills/{COMPANY}`` fetched once the job
        reports status ``"complete"``.

    Raises:
        RuntimeError: if the compile request is rejected or the job reports
            status ``"error"``.
        TimeoutError: if the job does not complete within the polling budget.
    """
    print(" Triggering compilation...")
    r = requests.post(
        f"{API}/compile", json={"company_id": COMPANY}, timeout=LONG_TIMEOUT
    )
    _require_ok(r, "Compile")
    job_id = r.json()["job_id"]
    print(f" Job ID: {job_id}")

    status_url = f"{API}/compile/{job_id}/status"

    # Poll the job status endpoint for completion
    for attempt in range(POLL_ATTEMPTS):
        time.sleep(POLL_INTERVAL_S)

        job_info = _fetch_json_or_none(status_url, "Status")
        if job_info is not None:
            status = job_info.get("status")
            if status == "error":
                detail = job_info.get("error_detail")
                print(f" [ERROR] Job failed: {detail}")
                raise RuntimeError(f"Compilation job failed: {detail}")
            if status == "complete":
                # A failed skills fetch is not fatal: keep polling and retry.
                data = _fetch_json_or_none(f"{API}/skills/{COMPANY}", "Skills")
                if data is not None:
                    skills = data.get("skills", [])
                    print(f" Compilation produced {len(skills)} skills")
                    return data

        print(f" Waiting... ({(attempt + 1) * POLL_INTERVAL_S}s)")

    # Timeout reached. Fetch final status (best effort) for the error message.
    final_status = "Unknown"
    final_error = "None"
    job_info = _fetch_json_or_none(status_url, "Status")
    if job_info is not None:
        final_status = job_info.get("status", "Unknown")
        final_error = job_info.get("error_detail", "None")

    minutes = POLL_ATTEMPTS * POLL_INTERVAL_S // 60
    raise TimeoutError(
        f"Compilation did not complete within {minutes} minutes. "
        f"Final status: {final_status}, Error: {final_error}"
    )


def get_skills():
    """Return the JSON payload of ``GET /skills/{COMPANY}``.

    Raises:
        RuntimeError: if the endpoint answers with a non-200 status.
    """
    r = requests.get(f"{API}/skills/{COMPANY}", timeout=SHORT_TIMEOUT)
    _require_ok(r, "Skills fetch")
    return r.json()


def query_agent(scenario: str, context: Optional[dict] = None):
    """POST a scenario to ``/agent/query`` and return the decoded JSON reply.

    Args:
        scenario: Free-text scenario sent as ``scenario_text``.
        context: Optional structured context sent as ``json_context``; an
            empty dict is sent when omitted.

    Raises:
        RuntimeError: if the endpoint answers with a non-200 status.
    """
    r = requests.post(
        f"{API}/agent/query",
        json={
            "company_id": COMPANY,
            "scenario_text": scenario,
            "json_context": context or {},
        },
        timeout=LONG_TIMEOUT,
    )
    _require_ok(r, "Agent query")
    return r.json()


def test_gibberish():
    """Gibberish should get low confidence and no specific action."""
    print("\n3. Testing gibberish rejection...")
    result = query_agent("blah blah blah fafa asdfasdf")
    # A missing confidence is treated as 1.0, i.e. as "too confident".
    confidence = result.get("confidence", 1.0)
    print(f" Gibberish confidence: {confidence}")
    print(f" Action: {result.get('recommended_action', 'N/A')}")
    if confidence < 0.4:
        print(" [PASS] Low confidence for gibberish")
    else:
        print(
            f" [WARN] Confidence {confidence} is higher than expected for gibberish"
        )


def _extend_refund_window(text: str) -> str:
    """Return *text* with the 30-day refund window rewritten to 60 days.

    First rewrites "30 day(s)" / "30-day(s)"; if none is present, falls back
    to rewriting every standalone "30" token. Returns *text* unchanged when
    neither pattern occurs.
    """
    modified = _REFUND_WINDOW_RE.sub(r"60\1", text)
    if modified == text:
        # Try alternative pattern: any standalone "30" (not "2030" or "130").
        modified = re.sub(r"\b30\b", "60", text)
    return modified


def test_dynamic_policy_change():
    """
    Core test: modify the refund SOP, recompile, and verify the change
    propagates.

    The original SOP text is always written back, even when a step fails.
    NOTE: the SOP is restored on disk only; no recompile is triggered
    afterwards, so the backend keeps the skills compiled from the modified SOP
    until the next compilation.

    Raises:
        RuntimeError: if the SOP contains no 30-day window to modify, or if a
            backend call fails.
        TimeoutError: if a compilation does not finish in time.
    """
    print("\n4. Testing dynamic policy change...")

    # Save original SOP
    original_sop = read_sop()
    print(f" Original SOP loaded ({len(original_sop)} chars)")

    # Fail before the (slow) first compile if there is nothing to change;
    # otherwise the test would report a misleading "system is static" result.
    modified_sop = _extend_refund_window(original_sop)
    if modified_sop == original_sop:
        raise RuntimeError(
            f"SOP at {SOP_PATH} contains no '30' refund window to modify"
        )

    # Compile with original SOP (this may already be done)
    print("\n Step A: Compile with ORIGINAL policy...")
    skills_v1 = compile_and_wait()
    skills_v1_text = json.dumps(skills_v1)

    # Query the agent about refunds with original policy
    print("\n Step B: Query agent about refunds (original policy)...")
    result_v1 = query_agent(_REFUND_SCENARIO, dict(_REFUND_CONTEXT))
    print(f" v1 action: {result_v1.get('recommended_action')}")
    print(f" v1 rule: {result_v1.get('rule_applied', 'N/A')}")

    try:
        # Now modify the SOP - change the refund window
        print("\n Step C: Modifying SOP (changing refund window)...")
        write_sop(modified_sop)
        print(" SOP modified: 30 -> 60 days")

        # Recompile
        print("\n Step D: Recompiling with MODIFIED policy...")
        skills_v2 = compile_and_wait()
        skills_v2_text = json.dumps(skills_v2)

        # Check that skills actually changed
        changed = skills_v1_text != skills_v2_text
        print(f"\n Skills changed after recompile: {changed}")

        # Query the agent again
        print("\n Step E: Query agent about refunds (modified policy)...")
        result_v2 = query_agent(_REFUND_SCENARIO, dict(_REFUND_CONTEXT))
        print(f" v2 action: {result_v2.get('recommended_action')}")
        print(f" v2 rule: {result_v2.get('rule_applied', 'N/A')}")
    finally:
        # Restore original SOP regardless of how steps C-E ended, so a failed
        # run never leaves the source document modified.
        print("\n Step F: Restoring original SOP...")
        write_sop(original_sop)
        print(" Original SOP restored.")

    # Check for the policy change in v2
    v2_mentions_60 = "60" in json.dumps(result_v2)
    print(f" v2 references '60': {v2_mentions_60}")

    # Check if actions actually changed based on policy
    v1_action_lower = str(result_v1.get("recommended_action", "")).lower()
    v2_action_lower = str(result_v2.get("recommended_action", "")).lower()

    # Under 30 days limit (v1), 45 days should be denied/not allowed
    # Under 60 days limit (v2), 45 days should be approved/prorated
    denied_under_v1 = any(
        phrase in v1_action_lower
        for phrase in ("deny", "no refund", "not eligible", "cannot")
    )
    approved_under_v2 = any(
        phrase in v2_action_lower for phrase in ("approve", "prorated", "allow")
    )
    policy_executed_correctly = denied_under_v1 and approved_under_v2

    print(
        f" Policy execution behavior changed appropriately (Deny -> Approve): {policy_executed_correctly}"
    )

    # Final verdict
    print("\n --- RESULTS ---")
    if changed:
        print(" [PASS] Skills changed after source modification and recompile")
    else:
        print(" [FAIL] Skills did NOT change - system may still be static")

    if policy_executed_correctly:
        print(
            " [PASS] Agent correctly executed the policy change (Denied at 45 days under 30-day SOP, Approved under 60-day SOP!)"
        )
    elif v2_mentions_60:
        print(" [PASS] Agent response reflects the modified policy (60 days)")
    else:
        print(
            " [WARN] Agent response did not change behavior or mention the new policy"
        )


def test_semantic_diff():
    """Test the /diff/{v1}/{v2} endpoint.

    Compares the two most recent entries returned by
    ``GET /brain/versions/{COMPANY}`` (index 1 as the older side, index 0 as
    the newer side) and reports the change counts from the diff summary.
    Skips, rather than fails, when fewer than two versions are available.
    """
    print("\n5. Testing semantic diff engine...")

    # Get version history
    r = requests.get(f"{API}/brain/versions/{COMPANY}", timeout=SHORT_TIMEOUT)
    if r.status_code != 200:
        print(" [SKIP] Could not fetch version history")
        return

    versions = r.json().get("versions", [])
    if len(versions) < 2:
        print(" [SKIP] Need at least 2 compiled versions for diff")
        return

    v1 = versions[1]["version"]
    v2 = versions[0]["version"]
    print(f" Comparing {v1} → {v2}")

    r = requests.get(
        f"{API}/diff/{v1}/{v2}",
        params={"company_id": COMPANY},
        timeout=LONG_TIMEOUT,
    )
    if r.status_code != 200:
        print(f" [FAIL] Diff endpoint returned {r.status_code}: {r.text}")
        return

    diff = r.json()
    summary = diff.get("summary", {})
    print(
        f" Added: {summary.get('added_count', 0)}, Deleted: {summary.get('deleted_count', 0)}, Modified: {summary.get('modified_count', 0)}"
    )
    print(f" Confidence shifts: {summary.get('confidence_shift_count', 0)}")
    print(
        f" V1 skills: {summary.get('v1_skills', 0)} → V2 skills: {summary.get('v2_skills', 0)}"
    )

    change_counters = (
        "added_count",
        "modified_count",
        "deleted_count",
        "confidence_shift_count",
    )
    if any(summary.get(key, 0) > 0 for key in change_counters):
        print(" [PASS] Semantic diff detected changes between versions")
    else:
        print(
            " [WARN] Diff returned no changes — may indicate skills didn't change or diff has a bug"
        )


def main():
    """Run the smoke-test sequence against the local backend.

    Exits with status 1 when the API is unreachable or the initial
    compilation fails; failures of the later tests are reported and the run
    continues with the next test.
    """
    print("=" * 60)
    print("KERNL SMOKE TEST — Proving the system is dynamic")
    print("=" * 60)

    try:
        check_health()
    except Exception as e:
        print(f" [FATAL] API not reachable: {e}")
        print(
            " Make sure backend is running: python -m uvicorn backend.main:app --port 8080"
        )
        sys.exit(1)

    # Test 1: Compile and get skills
    print("\n2. Initial compilation...")
    try:
        skills = compile_and_wait()
        print(f" Got {len(skills.get('skills', []))} skills")
    except Exception as e:
        print(f" [ERROR] Compilation failed: {e}")
        sys.exit(1)

    # Test 2: Gibberish rejection
    try:
        test_gibberish()
    except Exception as e:
        print(f" [ERROR] Gibberish test failed: {e}")

    # Test 3: Dynamic policy change
    # (the test itself restores the SOP in a ``finally`` block)
    try:
        test_dynamic_policy_change()
    except Exception as e:
        print(f" [ERROR] Dynamic test failed: {e}")

    # Test 4: Semantic diff
    try:
        test_semantic_diff()
    except Exception as e:
        print(f" [ERROR] Diff test failed: {e}")

    print("\n" + "=" * 60)
    print("SMOKE TEST COMPLETE")
    print("=" * 60)


if __name__ == "__main__":
    main()