# NOTE: page-scrape residue, not part of the script: "Spaces: Sleeping / Sleeping"
"""
Local test script — verifies the environment works without needing an LLM API.

Run: uv run python test_local.py
"""
import json
import sys

# Banner is printed before any project module is imported, so a failing
# import in one of the sections below shows up underneath this header.
print("=" * 60)
print("Testing Policy-to-Logic RL Environment")
print("=" * 60)
# ── Test 1: DSL Engine ────────────────────────────────────────────
print("\n[1/5] Testing DSL Engine...")
# validate_rules is imported but not called below; the import alone confirms
# that the name is exported by the module.
from policy_to_logic_env.server.dsl_engine import parse_rules, execute_rules, validate_rules

# Ruleset under test: DENY "sensitive" and "internal" data when time >= 18 or
# time < 9; anything that matches no rule falls through to the ALLOW default.
rules_json = json.dumps({
    "rules": [
        {
            "if": [
                {"field": "time", "op": ">=", "value": 18},
                {"field": "data_type", "op": "==", "value": "sensitive"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": "<", "value": 9},
                {"field": "data_type", "op": "==", "value": "sensitive"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": ">=", "value": 18},
                {"field": "data_type", "op": "==", "value": "internal"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": "<", "value": 9},
                {"field": "data_type", "op": "==", "value": "internal"}
            ],
            "then": "DENY"
        }
    ],
    "default": "ALLOW"
})

# parse_rules takes the JSON text and returns a (parsed rules, errors) pair.
rules_data, errors = parse_rules(rules_json)
assert rules_data is not None, f"Parse failed: {errors}"
assert len(errors) == 0, f"Unexpected parse errors: {errors}"

# Test execution
# time=20 with sensitive data matches the first rule.
result = execute_rules(rules_data, {"time": 20, "data_type": "sensitive"})
assert result == "DENY", f"Expected DENY, got {result}"

# time=12 is outside both time conditions, so no rule matches.
result = execute_rules(rules_data, {"time": 12, "data_type": "sensitive"})
assert result == "ALLOW", f"Expected ALLOW, got {result}"

# "public" data is not named by any rule, regardless of time.
result = execute_rules(rules_data, {"time": 22, "data_type": "public"})
assert result == "ALLOW", f"Expected ALLOW, got {result}"

print(" ✓ DSL Engine working correctly")
# ── Test 2: Scenario Generator ────────────────────────────────────
print("\n[2/5] Testing Scenario Generator...")
from policy_to_logic_env.server.scenario_generator import generate_scenarios

# Every task must yield at least one scenario, and each scenario must carry
# an "expected_decision" key.
for task_name in ["data_access", "resource_access", "transaction_approval"]:
    scenarios = generate_scenarios(task_name)
    assert len(scenarios) > 0, f"No scenarios for {task_name}"
    assert all("expected_decision" in s for s in scenarios), f"Missing expected_decision in {task_name}"
    print(f" ✓ {task_name}: {len(scenarios)} scenarios generated")
# ── Test 3: Ground Truth ──────────────────────────────────────────
print("\n[3/5] Testing Ground Truth Engine...")
from policy_to_logic_env.server.ground_truth import evaluate_ground_truth, answer_clarification

# Each check calls evaluate_ground_truth(task_name, scenario) and compares the
# returned decision string against the expected value.

# Test data_access
assert evaluate_ground_truth(
    "data_access", {"time": 20, "data_type": "sensitive"}
) == "DENY", "data_access: expected DENY for time=20, sensitive"
assert evaluate_ground_truth(
    "data_access", {"time": 12, "data_type": "sensitive"}
) == "ALLOW", "data_access: expected ALLOW for time=12, sensitive"
assert evaluate_ground_truth(
    "data_access", {"time": 3, "data_type": "public"}
) == "ALLOW", "data_access: expected ALLOW for time=3, public"

# Test resource_access
assert evaluate_ground_truth(
    "resource_access",
    {"role": "senior", "time": 3, "document_type": "confidential"},
) == "ALLOW", "resource_access: expected ALLOW for senior/confidential at time=3"
assert evaluate_ground_truth(
    "resource_access",
    {"role": "contractor", "time": 12, "document_type": "internal"},
) == "DENY", "resource_access: expected DENY for contractor/internal at time=12"
assert evaluate_ground_truth(
    "resource_access",
    {"role": "junior", "time": 12, "document_type": "internal"},
) == "ALLOW", "resource_access: expected ALLOW for junior/internal at time=12"

# Test transaction_approval
assert evaluate_ground_truth(
    "transaction_approval",
    {"amount": 100, "transfer_type": "international", "time": 12, "initiator_role": "employee"},
) == "COMPLIANCE_REVIEW", "transaction_approval: expected COMPLIANCE_REVIEW for international transfer"
assert evaluate_ground_truth(
    "transaction_approval",
    {"amount": 10000, "transfer_type": "domestic", "time": 20, "initiator_role": "employee"},
) == "HOLD", "transaction_approval: expected HOLD for amount=10000 at time=20"
assert evaluate_ground_truth(
    "transaction_approval",
    {"amount": 6000, "transfer_type": "domestic", "time": 12, "initiator_role": "manager"},
) == "APPROVE", "transaction_approval: expected APPROVE for manager, amount=6000"

# Test clarification oracle
# The answer text is only checked for containing the substring "5,000".
answer = answer_clarification("transaction_approval", "What is the standard limit?")
assert "5,000" in answer, f"Expected '5,000' in oracle answer, got {answer!r}"

print(" ✓ Ground Truth and Oracle working correctly")
# ── Test 4: Graders ───────────────────────────────────────────────
print("\n[4/5] Testing Graders...")
from policy_to_logic_env.server.graders import grade_task

# Grade a perfect ruleset for data_access
# Passed to grade_task as a plain dict (not JSON text); the same object is
# JSON-encoded again later in the full environment loop.
perfect_rules = {
    "rules": [
        {
            "if": [
                {"field": "time", "op": ">=", "value": 18},
                {"field": "data_type", "op": "==", "value": "sensitive"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": "<", "value": 9},
                {"field": "data_type", "op": "==", "value": "sensitive"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": ">=", "value": 18},
                {"field": "data_type", "op": "==", "value": "internal"}
            ],
            "then": "DENY"
        },
        {
            "if": [
                {"field": "time", "op": "<", "value": 9},
                {"field": "data_type", "op": "==", "value": "internal"}
            ],
            "then": "DENY"
        }
    ],
    "default": "ALLOW"
}

# grade_task returns a (score, details) pair; details is indexed with the
# "passed" and "total" keys below.
score, details = grade_task("data_access", perfect_rules)
print(f" Perfect rules score: {score:.2%} ({details['passed']}/{details['total']})")
assert score >= 0.9, f"Perfect rules should score >=0.9, got {score}"

# Grade an empty ruleset
# Informational only: the score is printed but no threshold is asserted.
empty_rules = {"rules": [], "default": "ALLOW"}
score_empty, details_empty = grade_task("data_access", empty_rules)
print(f" Empty rules score: {score_empty:.2%} ({details_empty['passed']}/{details_empty['total']})")

print(" ✓ Graders working correctly")
# ── Test 5: Full Environment Loop ─────────────────────────────────
print("\n[5/5] Testing Full Environment Loop...")
from policy_to_logic_env.server.environment import PolicyToLogicEnvironment
from policy_to_logic_env.models import PolicyToLogicAction

env = PolicyToLogicEnvironment()

# Reset
result = env.reset(task_name="data_access")
assert not result.done
assert result.observation.task_name == "data_access"
assert result.observation.step_number == 0
print(f" Reset OK. Policy: {result.observation.policy_text[:60]}...")

# Step 1: Ask clarification
# Action content is a JSON-encoded string, not a dict.
result = env.step(PolicyToLogicAction(
    action_type="ask_clarification",
    content=json.dumps({"question": "What are working hours?"})
))
assert not result.done
assert result.observation.clarification_response is not None
print(f" Step 1 (clarify): answer='{result.observation.clarification_response[:60]}...', reward={result.reward:.2f}")

# Step 2: Propose rules
# Reuses perfect_rules from the grader section; the outcome is only printed,
# nothing is asserted about accuracy, reward or done.
result = env.step(PolicyToLogicAction(
    action_type="propose_rules",
    content=json.dumps(perfect_rules)
))
print(f" Step 2 (propose): accuracy={result.observation.current_accuracy:.2%}, reward={result.reward:.2f}, done={result.done}")

# Check state
state = env.state()
print(f" State: episode={state.episode_id}, steps={state.step_count}, questions={state.questions_asked}")

print(" ✓ Full environment loop working correctly")
# ── Summary ───────────────────────────────────────────────────────
# Reached only if every assertion above passed; a failing assert aborts the
# script before this point.
print("\n" + "=" * 60)
print("🎉 ALL TESTS PASSED! Environment is working correctly.")
print("=" * 60)
print("\nNext steps:")
print(" 1. Start server: uv run python main.py")
print(" 2. Test API: curl -X POST http://localhost:7860/reset -H 'Content-Type: application/json' -d '{}'")
print(" 3. Run inference: HF_TOKEN=xxx uv run python inference.py")