Spaces:

Godreign
/

Policy2Logic

Running

App Files Community

Godreign-Y committed 19 days ago

Commit

8942e3d

1 Parent(s): 7b82b54

after redesign with claude

Browse files

Files changed (3) hide show

policy_to_logic_env/server/ground_truth.py +30 -12
policy_to_logic_env/server/policies.py +281 -30
test_hf_spaces.py +175 -53

policy_to_logic_env/server/ground_truth.py CHANGED Viewed

@@ -133,11 +133,23 @@ def _ground_truth_transaction_approval(s: dict) -> str:
 def answer_clarification(task_name: str, question: str) -> str:
     """
-    Deterministic clarification oracle.
-    Matches question text against known keywords for the given task
-    and returns a structured answer. If no match is found, returns
-    a generic response.
     Args:
         task_name: Current task name
@@ -149,22 +161,28 @@ def answer_clarification(task_name: str, question: str) -> str:
     task = get_task(task_name)
     question_lower = question.lower().strip()
-    # Check each keyword in the clarification map
     best_match = None
-    best_match_len = 0
     for keyword, answer in task.clarification_map.items():
-        if keyword.lower() in question_lower:
-            # Prefer longer keyword matches (more specific)
-            if len(keyword) > best_match_len:
                 best_match = answer
-                best_match_len = len(keyword)
     if best_match:
         return best_match
     return (
         "I can provide information about the specific terms and parameters "
-        "mentioned in the policy. Please ask about a specific aspect such as "
-        "time constraints, roles, thresholds, or document/data types."
     )

 def answer_clarification(task_name: str, question: str) -> str:
     """
+    Deterministic clarification oracle with progressive revelation.
+    Uses compound keyword matching to provide layered answers:
+      - Vague questions (match short keywords) → partial, potentially
+        ambiguous truths that may mislead if taken at face value.
+      - Specific questions (match long/compound keywords) → precise,
+        ground-truth-aligned answers.
+    Compound keywords: if a keyword contains spaces, ALL space-separated
+    parts must appear as substrings anywhere in the question
+    (order-independent). Keywords with more parts take priority (more
+    specific answer wins); ties are broken by total keyword length.
+    This design supports RL training where agents must learn to:
+      1. Detect ambiguity in initial policy text
+      2. Ask targeted questions to resolve ambiguity
+      3. Recognize when earlier (vague) answers were misleading
+      4. Reconcile contradictory signals by drilling deeper
     Args:
         task_name: Current task name
     task = get_task(task_name)
     question_lower = question.lower().strip()
     best_match = None
+    best_match_score = (0, 0)  # (num_parts, total_length)
     for keyword, answer in task.clarification_map.items():
+        keyword_lower = keyword.lower()
+        keyword_parts = keyword_lower.split()
+        # ALL parts of the keyword must appear in the question
+        if all(part in question_lower for part in keyword_parts):
+            # Score: more keyword parts = more specific = higher priority
+            # Tiebreak by total keyword length
+            score = (len(keyword_parts), len(keyword_lower))
+            if score > best_match_score:
                 best_match = answer
+                best_match_score = score
     if best_match:
         return best_match
     return (
         "I can provide information about the specific terms and parameters "
+        "mentioned in the policy. Try asking about specific aspects like "
+        "time boundaries, exact thresholds, role-specific permissions, "
+        "or how specific edge cases are handled."
     )

policy_to_logic_env/server/policies.py CHANGED Viewed

@@ -54,12 +54,70 @@ DATA_ACCESS = TaskConfig(
         "work_end": 18,
     },
     clarification_map={
-        "working hours": "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format).",
-        "work hours": "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format).",
-        "sensitive": "Sensitive data includes personal records, financial data, and proprietary information.",
-        "internal": "Internal data follows the same access rules as sensitive data.",
-        "public": "Public data has no access restrictions and can be accessed at any time.",
-        "after hours": "After hours means any time before 9:00 AM or after 6:00 PM (before 9 or after 18).",
     },
     max_steps=5,
     scenario_count=30,
@@ -89,15 +147,98 @@ RESOURCE_ACCESS = TaskConfig(
         "business_end": 17,
     },
     clarification_map={
-        "business hours": "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format).",
-        "work hours": "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format).",
-        "junior": "Junior employees are entry-level staff. They can access public and internal documents during business hours, but not confidential documents outside business hours.",
-        "senior": "Senior employees have unrestricted access to all documents at all times.",
-        "contractor": "Contractors can only access public documents. They cannot access internal or confidential documents at any time.",
-        "confidential": "Confidential documents include board minutes, salary data, and strategic plans.",
-        "internal": "Internal documents include team wikis, project plans, and internal communications.",
-        "public": "Public documents include published reports, press releases, and public-facing content.",
-        "outside business": "Outside business hours means before 8:00 AM or after 5:00 PM.",
     },
     max_steps=7,
     scenario_count=50,
@@ -132,21 +273,131 @@ TRANSACTION_APPROVAL = TaskConfig(
         "business_end": 17,
     },
     clarification_map={
-        "standard limit": "The standard transaction limit is $5,000. Transactions above this amount require manager approval.",
-        "limit": "The standard transaction limit is $5,000.",
-        "threshold": "The standard limit is $5,000. The high-value threshold for domestic transactions is $10,000.",
-        "high-value": "High-value domestic transactions are those with an amount of $10,000 or more.",
-        "high value": "High-value domestic transactions are those with an amount of $10,000 or more.",
-        "business hours": "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format).",
-        "work hours": "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format).",
-        "international": "All international transfers require compliance review, regardless of amount or time.",
-        "manager": "Manager-initiated transactions are exempt from the standard limit requirement. They are auto-approved for domestic transactions unless high-value and outside business hours.",
-        "compliance": "Compliance review is required for all international transfers. It is a separate process from manager approval.",
-        "routine": "Routine transactions are domestic transactions within the standard limit ($5,000 or less).",
-        "exempt": "Manager-initiated transactions are exempt from the standard limit. However, they still follow high-value and international rules.",
-        "non-business": "Non-business hours means before 9:00 AM or after 5:00 PM.",
-        "system": "System-initiated transactions follow the same rules as employee-initiated ones.",
-        "domestic": "Domestic transactions that are within limits are auto-approved. High-value ones outside business hours are held.",
     },
     max_steps=7,
     scenario_count=80,

         "work_end": 18,
     },
     clarification_map={
+        # ── Level 1: General (single short keyword → partial/ambiguous) ──
+        "hours": (
+            "Working hours are from 9 AM to 6 PM."
+        ),
+        "sensitive": (
+            "Sensitive data includes personal records, financial data, "
+            "and proprietary information."
+        ),
+        "internal": (
+            "Internal data follows the same access rules as sensitive data."
+        ),
+        "public": (
+            "Public data has no access restrictions and can be accessed "
+            "at any time."
+        ),
+        "access": (
+            "Access depends on the data type and the current hour. "
+            "Public data is unrestricted. Other types have time-based rules."
+        ),
+        # ── Level 2: Medium specificity (common phrases → more detail) ──
+        "working hours": (
+            "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format). "
+            "Sensitive and internal data can only be accessed during this window."
+        ),
+        "work hours": (
+            "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format)."
+        ),
+        "after hours": (
+            "After hours means outside the working hours window. "
+            "This includes early morning hours and evening hours from 6 PM onward."
+        ),
+        # ── Level 3: Precise (compound/specific → ground truth aligned) ──
+        "working hours boundary": (
+            "Working hours use a half-open interval: hour >= 9 AND hour < 18. "
+            "Hour 9 is the first working hour. Hour 17 is the last working hour. "
+            "Hour 18 (6:00 PM) is NOT within working hours — it is the start of "
+            "after-hours."
+        ),
+        "exactly 18": (
+            "Hour 18 (6:00 PM) is considered after hours. The working hours "
+            "window ends BEFORE 18, so 18 is outside. Access to sensitive "
+            "and internal data is denied at hour 18."
+        ),
+        "time boundary": (
+            "The time boundaries are strict: working hours are "
+            "hour >= 9 AND hour < 18. Hour 9 is inside working hours, "
+            "hour 18 is outside. The last valid working hour is 17."
+        ),
+        "sensitive time": (
+            "Sensitive data can only be accessed when the hour is >= 9 AND "
+            "strictly less than 18. At hour 18, access is denied."
+        ),
+        "internal time": (
+            "Internal data follows the exact same time rules as sensitive "
+            "data: allowed when hour >= 9 AND hour < 18."
+        ),
+        "deny allow": (
+            "The decision is ALLOW for public data at any time, or for "
+            "sensitive/internal data during hours 9 through 17 (inclusive). "
+            "The decision is DENY for sensitive/internal data at hours 0-8 "
+            "and 18-23."
+        ),
     },
     max_steps=5,
     scenario_count=30,
         "business_end": 17,
     },
     clarification_map={
+        # ── Level 1: General (single keyword → partial/ambiguous truths) ──
+        # NOTE: "junior" answer is technically true but intentionally
+        # incomplete — it only mentions the "outside business hours"
+        # restriction, which can mislead the agent into thinking
+        # confidential IS accessible during business hours.
+        "junior": (
+            "Junior employees are entry-level staff. They can access public "
+            "and internal documents during business hours, but not confidential "
+            "documents outside business hours."
+        ),
+        "senior": (
+            "Senior employees have unrestricted access to all documents "
+            "at all times."
+        ),
+        "contractor": (
+            "Contractors can only access public documents. They cannot access "
+            "internal or confidential documents at any time."
+        ),
+        "confidential": (
+            "Confidential documents include board minutes, salary data, and "
+            "strategic plans. Access is highly restricted."
+        ),
+        "internal": (
+            "Internal documents include team wikis, project plans, and "
+            "internal communications."
+        ),
+        "public": (
+            "Public documents include published reports, press releases, and "
+            "public-facing content. No access restrictions."
+        ),
+        "hours": (
+            "Business hours are 8 AM to 5 PM."
+        ),
+        # ── Level 2: Medium specificity (common phrases → more detail) ──
+        "business hours": (
+            "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format). "
+            "Access permissions change based on whether the current hour falls "
+            "within this range."
+        ),
+        "work hours": (
+            "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format)."
+        ),
+        "outside business": (
+            "Outside business hours includes early morning and evening. "
+            "Restrictions are tighter outside this window for junior staff."
+        ),
+        # ── Level 3: Precise (compound keywords → ground truth aligned) ──
+        # These answers reveal the FULL truth, correcting any misleading
+        # impressions from Level 1 answers.
+        "junior confidential": (
+            "Junior employees CANNOT access confidential documents at ANY time, "
+            "regardless of whether it is during or outside business hours. "
+            "The policy statement about 'outside business hours' is a minimum "
+            "restriction — the actual rule is a blanket denial of confidential "
+            "access for juniors."
+        ),
+        "junior internal": (
+            "Junior employees can access internal documents ONLY during business "
+            "hours (hour >= 8 AND hour < 17). Outside business hours, internal "
+            "documents are denied for juniors."
+        ),
+        "junior public": (
+            "Junior employees can access public documents at any time. "
+            "Public access has no restrictions for any role."
+        ),
+        "business hours boundary": (
+            "Business hours use a half-open interval: hour >= 8 AND hour < 17. "
+            "Hour 8 is the first business hour. Hour 16 is the last business hour. "
+            "Hour 17 (5:00 PM) is NOT within business hours."
+        ),
+        "exactly 17": (
+            "Hour 17 (5:00 PM) is considered outside business hours. "
+            "The business hours window ends BEFORE 17. Junior employees lose "
+            "access to internal documents at hour 17."
+        ),
+        "time boundary": (
+            "Business hours are hour >= 8 AND hour < 17. "
+            "Hour 8 is inside, hour 17 is outside. "
+            "The last valid business hour is 16."
+        ),
+        "confidential during": (
+            "Confidential documents are NOT accessible to junior employees "
+            "during business hours. The policy only explicitly mentions the "
+            "'outside business hours' restriction, but the actual rule denies "
+            "junior access to confidential at all times."
+        ),
+        "contractor internal": (
+            "Contractors cannot access internal documents. They are restricted "
+            "to public documents only, regardless of time."
+        ),
     },
     max_steps=7,
     scenario_count=50,
         "business_end": 17,
     },
     clarification_map={
+        # ── Level 1: General (single keyword → partial/vague) ──
+        "limit": (
+            "The standard transaction limit is $5,000."
+        ),
+        "international": (
+            "All international transfers require compliance review, "
+            "regardless of amount or time."
+        ),
+        "manager": (
+            "Manager-initiated transactions are exempt from the standard "
+            "limit requirement. They are auto-approved for domestic "
+            "transactions unless other rules apply."
+        ),
+        "domestic": (
+            "Domestic transactions follow different rules based on amount "
+            "and time of day."
+        ),
+        "compliance": (
+            "Compliance review is required for all international transfers. "
+            "It is a separate process from manager approval."
+        ),
+        "routine": (
+            "Routine transactions are domestic transactions within the "
+            "standard limit."
+        ),
+        "system": (
+            "System-initiated transactions follow the same rules as "
+            "employee-initiated ones."
+        ),
+        "hours": (
+            "Business hours are 9 AM to 5 PM."
+        ),
+        "exempt": (
+            "Manager-initiated transactions are exempt from the standard "
+            "limit. However, other rules may still apply."
+        ),
+        # ── Level 2: Medium specificity ──
+        "standard limit": (
+            "The standard transaction limit is $5,000. Transactions above "
+            "this amount require manager approval unless the initiator is "
+            "a manager."
+        ),
+        "high-value": (
+            "High-value domestic transactions are those with an amount of "
+            "$10,000 or more."
+        ),
+        "high value": (
+            "High-value domestic transactions are those with an amount of "
+            "$10,000 or more."
+        ),
+        "threshold": (
+            "The standard limit is $5,000. The high-value threshold for "
+            "domestic transactions is $10,000."
+        ),
+        "business hours": (
+            "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
+        ),
+        "work hours": (
+            "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
+        ),
+        "non-business": (
+            "Non-business hours means outside the 9 AM to 5 PM window, "
+            "including the 5 PM hour itself."
+        ),
+        # ── Level 3: Precise (compound keywords → ground truth) ──
+        "exactly 5000": (
+            "A transaction of exactly $5,000 is WITHIN the standard limit and "
+            "is auto-approved for domestic transactions. Only amounts STRICTLY "
+            "above $5,000 (i.e., $5,001+) trigger the approval requirement. "
+            "The comparison is amount > 5000, not amount >= 5000."
+        ),
+        "exactly 10000": (
+            "A domestic transaction of exactly $10,000 IS considered high-value. "
+            "The threshold is amount >= 10000. A $9,999 transaction is NOT "
+            "high-value."
+        ),
+        "exactly 17": (
+            "Hour 17 (5:00 PM) is considered non-business hours. The business "
+            "hours window ends BEFORE 17. A high-value domestic transaction at "
+            "hour 17 would be held for review."
+        ),
+        "business hours boundary": (
+            "Business hours use a half-open interval: hour >= 9 AND hour < 17. "
+            "Hour 9 is business hours. Hour 16 is the last business hour. "
+            "Hour 17 (5:00 PM) is NOT business hours."
+        ),
+        "time boundary": (
+            "Business hours are hour >= 9 AND hour < 17. "
+            "Hour 9 is inside, hour 17 is outside."
+        ),
+        "manager exempt": (
+            "Manager-initiated transactions are exempt from the standard $5,000 "
+            "limit only. They are NOT exempt from international compliance review "
+            "or the high-value domestic HOLD rule. A manager's $10,000 domestic "
+            "transaction outside business hours is still HELD."
+        ),
+        "manager high-value": (
+            "Manager exemption only applies to the standard limit ($5,000). "
+            "Managers are still subject to: (1) COMPLIANCE_REVIEW for international "
+            "transfers, and (2) HOLD for high-value domestic transactions (>= $10,000) "
+            "outside business hours. The exemption is narrow."
+        ),
+        "manager international": (
+            "Even manager-initiated international transfers require COMPLIANCE_REVIEW. "
+            "The manager exemption does not override the international transfer rule."
+        ),
+        "domestic hold": (
+            "A domestic transaction is HELD when: (1) amount >= $10,000 AND "
+            "(2) the hour is outside business hours (hour < 9 or hour >= 17). "
+            "Both conditions must be true. During business hours, high-value "
+            "domestic transactions get REQUIRE_APPROVAL instead (if not manager-initiated)."
+        ),
+        "rule priority": (
+            "Rules are evaluated in priority order: "
+            "(1) International transfers → COMPLIANCE_REVIEW always. "
+            "(2) High-value domestic (>= $10,000) outside business hours → HOLD. "
+            "(3) Above standard limit (> $5,000) and not manager → REQUIRE_APPROVAL. "
+            "(4) Everything else → APPROVE."
+        ),
+        "international manager": (
+            "International transfers ALWAYS go to COMPLIANCE_REVIEW, even for "
+            "managers. This is the highest-priority rule."
+        ),
     },
     max_steps=7,
     scenario_count=80,

test_hf_spaces.py CHANGED Viewed

@@ -4,6 +4,7 @@ HF Spaces Test Runner - Policy-to-Logic Environment
 Tests all endpoints on the deployed HF Spaces and generates a report.
 Run it:
     uv run python test_hf_spaces.py
 """
@@ -12,12 +13,16 @@ import json
 import time
 from typing import Dict, Any, List
 from datetime import datetime
-from urllib.parse import urljoin
-# HF Spaces URL
-HF_SPACE_URL = "https://huggingface.co/spaces/Godreign/Policy2Logic"
-# Extract the actual API endpoint
-BASE_URL = HF_SPACE_URL
 class HFSpacesTestRunner:
     def __init__(self, base_url: str):
@@ -26,7 +31,7 @@ class HFSpacesTestRunner:
         self.passed = 0
         self.failed = 0
         self.start_time = datetime.now()
     def log(self, message: str, level: str = "INFO"):
         """Print formatted log message."""
         timestamp = datetime.now().strftime("%H:%M:%S")
@@ -38,22 +43,81 @@ class HFSpacesTestRunner:
             "TEST": "🧪"
         }
         print(f"[{timestamp}] {prefix.get(level, '')} {message}")
-    def test_endpoint(self, method: str, endpoint: str, data: dict = None,
-                      description: str = "", timeout: int = 10) -> bool:
         """Test an HF Spaces endpoint and record result."""
-        url = urljoin(self.base_url, endpoint)
         display_name = description or endpoint
         self.log(f"Testing: {display_name}", "TEST")
         try:
             if method == "POST":
                 response = requests.post(url, json=data, timeout=timeout)
             else:
                 response = requests.get(url, timeout=timeout)
             success = response.status_code in [200, 201]
             result = {
                 "test": display_name,
                 "endpoint": endpoint,
@@ -64,14 +128,14 @@ class HFSpacesTestRunner:
                 "response": None,
                 "error": None
             }
             try:
                 result["response"] = response.json()
-            except:
                 result["response"] = response.text[:300]
             self.results.append(result)
             if success:
                 self.passed += 1
                 self.log(f"  Status: {response.status_code} - PASSED", "SUCCESS")
@@ -79,9 +143,9 @@ class HFSpacesTestRunner:
                 self.failed += 1
                 self.log(f"  Status: {response.status_code} - FAILED", "ERROR")
                 result["error"] = response.text[:200]
             return success
         except requests.exceptions.Timeout:
             self.failed += 1
             self.log(f"  TIMEOUT (>{timeout}s)", "ERROR")
@@ -124,63 +188,107 @@ class HFSpacesTestRunner:
                 "error": str(e)
             })
             return False
     def run_tests(self) -> bool:
         """Run all endpoint tests."""
-        self.log("Starting HF Spaces test suite...", "INFO")
         print("\n" + "="*70)
         # Test 1: Root endpoint
         self.test_endpoint("GET", "/", description="Root Endpoint")
         # Test 2: Health check
         self.test_endpoint("GET", "/health", description="Health Check")
         # Test 3: Root with query params (HF Spaces probe)
         self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
         # Test 4: List tasks
         self.test_endpoint("GET", "/tasks", description="List Available Tasks")
         # Test 5: Reset environment
         reset_result = self.test_endpoint(
-            "POST",
-            "/reset",
             data={"task_name": None},
             description="Reset Environment (Start Episode)"
         )
         # Test 6: Get state
         self.test_endpoint("GET", "/state", description="Get Current State")
-        # Test 7: Ask clarification
         if reset_result:
             self.test_endpoint(
                 "POST",
                 "/step",
                 data={
                     "action_type": "ask_clarification",
-                    "content": "What are the business hours?"
                 },
-                description="Step: Ask Clarification"
             )
-        # Test 8: Final state
-        self.test_endpoint("GET", "/state", description="Get Final State After Step")
         print("="*70 + "\n")
         return self.failed == 0
     def generate_report(self):
         """Generate and print test report."""
         duration = (datetime.now() - self.start_time).total_seconds()
         total = self.passed + self.failed
         success_rate = 100 * self.passed / total if total > 0 else 0
         print("\n" + "="*70)
         print("📊 HF SPACES TEST REPORT")
         print("="*70)
         print(f"Space URL:      {self.base_url}")
         print(f"Timestamp:      {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
         print(f"Duration:       {duration:.2f}s")
         print(f"Total Tests:    {total}")
@@ -188,7 +296,7 @@ class HFSpacesTestRunner:
         print(f"Failed:         {self.failed} ❌")
         print(f"Success Rate:   {success_rate:.1f}%")
         print("="*70)
         print("\n📋 DETAILED RESULTS:\n")
         for i, result in enumerate(self.results, 1):
             status_icon = "✅" if result["success"] else "❌"
@@ -197,41 +305,53 @@ class HFSpacesTestRunner:
             print(f"   URL:      {result['url']}")
             if result['status_code']:
                 print(f"   Status:   {result['status_code']}")
             if result['error']:
                 print(f"   Error:    {result['error']}")
             elif result['response']:
                 response_preview = result['response']
                 if isinstance(response_preview, dict):
-                    response_preview = json.dumps(response_preview, indent=4)[:200]
                 else:
-                    response_preview = str(response_preview)[:200]
                 print(f"   Response: {response_preview}...")
             print()
         print("="*70)
         if self.failed == 0:
             print("🎉 ALL TESTS PASSED - HF SPACES IS RUNNING!")
         else:
             print(f"⚠️  {self.failed} test(s) failed. Check details above.")
         print("="*70 + "\n")
         return self.failed == 0
     def run(self) -> bool:
         """Run the entire test suite."""
         print("\n🚀 Policy-to-Logic RL Environment - HF Spaces Test Suite\n")
-        self.log(f"Target: {self.base_url}", "INFO")
-        # Run tests
         all_passed = self.run_tests()
-        # Generate report
         self.generate_report()
         return all_passed
 def main():
     runner = HFSpacesTestRunner(BASE_URL)
     try:
@@ -242,6 +362,8 @@ def main():
         exit(1)
     except Exception as e:
         print(f"\n❌ Unexpected error: {e}")
         exit(1)
 if __name__ == "__main__":

 Tests all endpoints on the deployed HF Spaces and generates a report.
 Run it:
+    $env:UV_LINK_MODE="copy"
     uv run python test_hf_spaces.py
 """
 import time
 from typing import Dict, Any, List
 from datetime import datetime
+# ─── URL Construction ─────────────────────────────────────────────
+# HF Spaces URL → API endpoint
+# https://huggingface.co/spaces/{user}/{repo} → https://{user}-{repo}.hf.space
+HF_SPACE_WEB_URL = "https://huggingface.co/spaces/Godreign/Policy2Logic"
+parts = HF_SPACE_WEB_URL.split('/')
+username = parts[-2]   # "Godreign"
+repo_name = parts[-1]  # "Policy2Logic"
+BASE_URL = f"https://{username.lower()}-{repo_name.lower()}.hf.space"
 class HFSpacesTestRunner:
     def __init__(self, base_url: str):
         self.passed = 0
         self.failed = 0
         self.start_time = datetime.now()
     def log(self, message: str, level: str = "INFO"):
         """Print formatted log message."""
         timestamp = datetime.now().strftime("%H:%M:%S")
             "TEST": "🧪"
         }
         print(f"[{timestamp}] {prefix.get(level, '')} {message}")
+    # ── Connectivity Check ────────────────────────────────────────
+    def check_connectivity(self) -> bool:
+        """
+        Verify we can reach the HF Space before running tests.
+        Returns True if the space is reachable and responding.
+        """
+        print(f"\n🔗 Connectivity Check")
+        print(f"   Target URL: {self.base_url}")
+        print()
+        try:
+            # First, check with allow_redirects=False to detect proxy issues
+            resp = requests.get(
+                self.base_url,
+                timeout=15,
+                allow_redirects=False,
+            )
+            print(f"   Direct response:  status={resp.status_code}")
+            if resp.is_redirect or resp.is_permanent_redirect:
+                redirect_url = resp.headers.get("Location", "unknown")
+                print(f"   ⚠️  REDIRECT detected → {redirect_url}")
+                print(f"   The space may not be running or the URL format changed.")
+                print(f"   Expected API base: {self.base_url}")
+                return False
+            # Now check with redirects allowed (normal mode)
+            resp = requests.get(self.base_url, timeout=15)
+            if resp.status_code == 200:
+                try:
+                    data = resp.json()
+                    if data.get("status") == "running":
+                        print(f"   ✅ Space is RUNNING")
+                        print(f"   Response: {json.dumps(data, indent=2)[:200]}")
+                        return True
+                    else:
+                        print(f"   ⚠️  Unexpected response: {data}")
+                        return True  # Still reachable
+                except ValueError:
+                    print(f"   ⚠️  Non-JSON response (may be HF loading page)")
+                    print(f"   Body preview: {resp.text[:200]}")
+                    return False
+            else:
+                print(f"   ❌ Got status {resp.status_code}")
+                print(f"   Body: {resp.text[:200]}")
+                return False
+        except requests.exceptions.Timeout:
+            print(f"   ❌ Connection TIMEOUT (>15s)")
+            print(f"   The space may be sleeping. Visit {HF_SPACE_WEB_URL} to wake it.")
+            return False
+        except requests.exceptions.ConnectionError as e:
+            print(f"   ❌ Connection FAILED: {str(e)[:150]}")
+            return False
+        except Exception as e:
+            print(f"   ❌ Unexpected error: {e}")
+            return False
+    # ── Endpoint Testing ──────────────────────────────────────────
+    def test_endpoint(self, method: str, endpoint: str, data: dict = None,
+                      description: str = "", timeout: int = 15) -> bool:
         """Test an HF Spaces endpoint and record result.

         The request goes to ``self.base_url`` + ``endpoint``. A POST with
         ``data`` as the JSON body is sent when ``method`` is "POST"; any
         other value results in a GET.

         Args:
             method: HTTP verb; only "POST" is special-cased.
             endpoint: Path appended to the base URL.
             data: JSON payload passed to the POST request.
             description: Label used in log lines and in the recorded
                 result; falls back to ``endpoint`` when empty.
             timeout: Per-request timeout in seconds.

         Returns:
             True when the response status is 200 or 201; the exception
             path returns False.
         """
+        url = f"{self.base_url}{endpoint}"
         display_name = description or endpoint
         self.log(f"Testing: {display_name}", "TEST")
         try:
             if method == "POST":
                 response = requests.post(url, json=data, timeout=timeout)
             else:
                 response = requests.get(url, timeout=timeout)
             # Only 200 and 201 count as a pass.
             success = response.status_code in [200, 201]
             result = {
                 "test": display_name,
                 "endpoint": endpoint,
                 "response": None,
                 "error": None
             }
             # Prefer the parsed JSON body; fall back to the first 300
             # characters of the raw text when the body is not JSON.
             try:
                 result["response"] = response.json()
+            except ValueError:
                 result["response"] = response.text[:300]
             self.results.append(result)
             if success:
                 self.passed += 1
                 self.log(f"  Status: {response.status_code} - PASSED", "SUCCESS")
                 self.failed += 1
                 self.log(f"  Status: {response.status_code} - FAILED", "ERROR")
                 result["error"] = response.text[:200]
             return success
         except requests.exceptions.Timeout:
             self.failed += 1
             self.log(f"  TIMEOUT (>{timeout}s)", "ERROR")
                 "error": str(e)
             })
             return False
+    # ── Test Suite ────────────────────────────────────────────────
     def run_tests(self) -> bool:
         """Run all endpoint tests."""
+        self.log("Starting endpoint tests...", "INFO")
         print("\n" + "="*70)
         # Test 1: Root endpoint
         self.test_endpoint("GET", "/", description="Root Endpoint")
         # Test 2: Health check
         self.test_endpoint("GET", "/health", description="Health Check")
         # Test 3: Root with query params (HF Spaces probe)
         self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
         # Test 4: List tasks
         self.test_endpoint("GET", "/tasks", description="List Available Tasks")
         # Test 5: Reset environment
         reset_result = self.test_endpoint(
+            "POST",
+            "/reset",
             data={"task_name": None},
             description="Reset Environment (Start Episode)"
         )
         # Test 6: Get state
         self.test_endpoint("GET", "/state", description="Get Current State")
+        # Test 7: Ask clarification (progressive revelation test)
         if reset_result:
+            # Level 1 - vague question
             self.test_endpoint(
                 "POST",
                 "/step",
                 data={
                     "action_type": "ask_clarification",
+                    "content": "What are the working hours?"
                 },
+                description="Step: Ask Clarification (Level 1 - vague)"
             )
+            # Level 3 - specific compound question
+            self.test_endpoint(
+                "POST",
+                "/step",
+                data={
+                    "action_type": "ask_clarification",
+                    "content": "What happens at the working hours boundary, exactly at hour 18?"
+                },
+                description="Step: Ask Clarification (Level 3 - precise)"
+            )
+        # Test 8: Propose rules (valid DSL)
+        if reset_result:
+            self.test_endpoint(
+                "POST",
+                "/step",
+                data={
+                    "action_type": "propose_rules",
+                    "content": json.dumps({
+                        "rules": [
+                            {
+                                "if": [
+                                    {"field": "data_type", "op": "==", "value": "public"}
+                                ],
+                                "then": "ALLOW"
+                            },
+                            {
+                                "if": [
+                                    {"field": "time", "op": ">=", "value": 9},
+                                    {"field": "time", "op": "<", "value": 18}
+                                ],
+                                "then": "ALLOW"
+                            }
+                        ],
+                        "default": "DENY"
+                    })
+                },
+                description="Step: Propose Rules (Valid DSL)"
+            )
+        # Test 9: Final state
+        self.test_endpoint("GET", "/state", description="Get Final State After Steps")
         print("="*70 + "\n")
         return self.failed == 0
+    # ── Report Generation ─────────────────────────────────────────
     def generate_report(self):
         """Generate and print test report.

         Prints a summary (URLs, start timestamp, duration, totals and
         success rate) followed by one entry per item in ``self.results``,
         then a closing verdict line.

         Returns:
             True when ``self.failed`` is 0, False otherwise.
         """
         duration = (datetime.now() - self.start_time).total_seconds()
         total = self.passed + self.failed
         # Report 0% rather than dividing by zero when no test was counted.
         success_rate = 100 * self.passed / total if total > 0 else 0
         print("\n" + "="*70)
         print("📊 HF SPACES TEST REPORT")
         print("="*70)
         print(f"Space URL:      {self.base_url}")
+        print(f"Web URL:        {HF_SPACE_WEB_URL}")
         print(f"Timestamp:      {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
         print(f"Duration:       {duration:.2f}s")
         print(f"Total Tests:    {total}")
         print(f"Failed:         {self.failed} ❌")
         print(f"Success Rate:   {success_rate:.1f}%")
         print("="*70)
         print("\n📋 DETAILED RESULTS:\n")
         for i, result in enumerate(self.results, 1):
             status_icon = "✅" if result["success"] else "❌"
             print(f"   URL:      {result['url']}")
             if result['status_code']:
                 print(f"   Status:   {result['status_code']}")
             # An error takes precedence over the response preview.
             if result['error']:
                 print(f"   Error:    {result['error']}")
             elif result['response']:
                 # Preview is capped at 300 characters; dicts are shown as
                 # indented JSON, anything else via str().
                 response_preview = result['response']
                 if isinstance(response_preview, dict):
+                    response_preview = json.dumps(response_preview, indent=4)[:300]
                 else:
+                    response_preview = str(response_preview)[:300]
                 print(f"   Response: {response_preview}...")
             print()
         print("="*70)
         if self.failed == 0:
             print("🎉 ALL TESTS PASSED - HF SPACES IS RUNNING!")
         else:
             print(f"⚠️  {self.failed} test(s) failed. Check details above.")
         print("="*70 + "\n")
         return self.failed == 0
+    # ── Main Runner ───────────────────────────────────────────────
     def run(self) -> bool:
         """Drive the suite: connectivity check, endpoint tests, then report.

         Returns:
             False straight away (no tests run, no report printed) when the
             connectivity check fails; otherwise whatever ``run_tests()``
             returned.
         """
         print("\n🚀 Policy-to-Logic RL Environment - HF Spaces Test Suite\n")
         self.log(f"Target API:  {self.base_url}", "INFO")
         self.log(f"Source URL:  {HF_SPACE_WEB_URL}", "INFO")

         # Step 1: make sure the space answers at all
         reachable = self.check_connectivity()
         if reachable:
             print()
             # Step 2: endpoint tests
             outcome = self.run_tests()
             # Step 3: report
             self.generate_report()
             return outcome

         print("\n❌ Connectivity check FAILED. Cannot proceed with tests.")
         print(f"   Verify the space is running at: {HF_SPACE_WEB_URL}")
         print(f"   Expected API endpoint: {self.base_url}")
         return False
 def main():
     runner = HFSpacesTestRunner(BASE_URL)
     try:
         exit(1)
     except Exception as e:
         print(f"\n❌ Unexpected error: {e}")
+        import traceback
+        traceback.print_exc()
         exit(1)
 if __name__ == "__main__":