Godreign-Y committed on
Commit
8942e3d
·
1 Parent(s): 7b82b54

after redesign with claude

Browse files
policy_to_logic_env/server/ground_truth.py CHANGED
@@ -133,11 +133,23 @@ def _ground_truth_transaction_approval(s: dict) -> str:
133
 
134
  def answer_clarification(task_name: str, question: str) -> str:
135
  """
136
- Deterministic clarification oracle.
137
 
138
- Matches question text against known keywords for the given task
139
- and returns a structured answer. If no match is found, returns
140
- a generic response.
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  Args:
143
  task_name: Current task name
@@ -149,22 +161,28 @@ def answer_clarification(task_name: str, question: str) -> str:
149
  task = get_task(task_name)
150
  question_lower = question.lower().strip()
151
 
152
- # Check each keyword in the clarification map
153
  best_match = None
154
- best_match_len = 0
155
 
156
  for keyword, answer in task.clarification_map.items():
157
- if keyword.lower() in question_lower:
158
- # Prefer longer keyword matches (more specific)
159
- if len(keyword) > best_match_len:
 
 
 
 
 
 
160
  best_match = answer
161
- best_match_len = len(keyword)
162
 
163
  if best_match:
164
  return best_match
165
 
166
  return (
167
  "I can provide information about the specific terms and parameters "
168
- "mentioned in the policy. Please ask about a specific aspect such as "
169
- "time constraints, roles, thresholds, or document/data types."
 
170
  )
 
133
 
134
  def answer_clarification(task_name: str, question: str) -> str:
135
  """
136
+ Deterministic clarification oracle with progressive revelation.
137
 
138
+ Uses compound keyword matching to provide layered answers:
139
+ - Vague questions (match short keywords) → partial, potentially
140
+ ambiguous truths that may mislead if taken at face value.
141
+ - Specific questions (match long/compound keywords) → precise,
142
+ ground-truth-aligned answers.
143
+
144
+ Compound keywords: if a keyword contains spaces, ALL space-separated
145
+ words must appear anywhere in the question (order-independent).
146
+ More matched keywords = higher priority (more specific answer wins).
147
+
148
+ This design supports RL training where agents must learn to:
149
+ 1. Detect ambiguity in initial policy text
150
+ 2. Ask targeted questions to resolve ambiguity
151
+ 3. Recognize when earlier (vague) answers were misleading
152
+ 4. Reconcile contradictory signals by drilling deeper
153
 
154
  Args:
155
  task_name: Current task name
 
161
  task = get_task(task_name)
162
  question_lower = question.lower().strip()
163
 
 
164
  best_match = None
165
+ best_match_score = (0, 0) # (num_parts, total_length)
166
 
167
  for keyword, answer in task.clarification_map.items():
168
+ keyword_lower = keyword.lower()
169
+ keyword_parts = keyword_lower.split()
170
+
171
+ # ALL parts of the keyword must appear in the question
172
+ if all(part in question_lower for part in keyword_parts):
173
+ # Score: more keyword parts = more specific = higher priority
174
+ # Tiebreak by total keyword length
175
+ score = (len(keyword_parts), len(keyword_lower))
176
+ if score > best_match_score:
177
  best_match = answer
178
+ best_match_score = score
179
 
180
  if best_match:
181
  return best_match
182
 
183
  return (
184
  "I can provide information about the specific terms and parameters "
185
+ "mentioned in the policy. Try asking about specific aspects like "
186
+ "time boundaries, exact thresholds, role-specific permissions, "
187
+ "or how specific edge cases are handled."
188
  )
policy_to_logic_env/server/policies.py CHANGED
@@ -54,12 +54,70 @@ DATA_ACCESS = TaskConfig(
54
  "work_end": 18,
55
  },
56
  clarification_map={
57
- "working hours": "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format).",
58
- "work hours": "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format).",
59
- "sensitive": "Sensitive data includes personal records, financial data, and proprietary information.",
60
- "internal": "Internal data follows the same access rules as sensitive data.",
61
- "public": "Public data has no access restrictions and can be accessed at any time.",
62
- "after hours": "After hours means any time before 9:00 AM or after 6:00 PM (before 9 or after 18).",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  },
64
  max_steps=5,
65
  scenario_count=30,
@@ -89,15 +147,98 @@ RESOURCE_ACCESS = TaskConfig(
89
  "business_end": 17,
90
  },
91
  clarification_map={
92
- "business hours": "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format).",
93
- "work hours": "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format).",
94
- "junior": "Junior employees are entry-level staff. They can access public and internal documents during business hours, but not confidential documents outside business hours.",
95
- "senior": "Senior employees have unrestricted access to all documents at all times.",
96
- "contractor": "Contractors can only access public documents. They cannot access internal or confidential documents at any time.",
97
- "confidential": "Confidential documents include board minutes, salary data, and strategic plans.",
98
- "internal": "Internal documents include team wikis, project plans, and internal communications.",
99
- "public": "Public documents include published reports, press releases, and public-facing content.",
100
- "outside business": "Outside business hours means before 8:00 AM or after 5:00 PM.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  },
102
  max_steps=7,
103
  scenario_count=50,
@@ -132,21 +273,131 @@ TRANSACTION_APPROVAL = TaskConfig(
132
  "business_end": 17,
133
  },
134
  clarification_map={
135
- "standard limit": "The standard transaction limit is $5,000. Transactions above this amount require manager approval.",
136
- "limit": "The standard transaction limit is $5,000.",
137
- "threshold": "The standard limit is $5,000. The high-value threshold for domestic transactions is $10,000.",
138
- "high-value": "High-value domestic transactions are those with an amount of $10,000 or more.",
139
- "high value": "High-value domestic transactions are those with an amount of $10,000 or more.",
140
- "business hours": "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format).",
141
- "work hours": "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format).",
142
- "international": "All international transfers require compliance review, regardless of amount or time.",
143
- "manager": "Manager-initiated transactions are exempt from the standard limit requirement. They are auto-approved for domestic transactions unless high-value and outside business hours.",
144
- "compliance": "Compliance review is required for all international transfers. It is a separate process from manager approval.",
145
- "routine": "Routine transactions are domestic transactions within the standard limit ($5,000 or less).",
146
- "exempt": "Manager-initiated transactions are exempt from the standard limit. However, they still follow high-value and international rules.",
147
- "non-business": "Non-business hours means before 9:00 AM or after 5:00 PM.",
148
- "system": "System-initiated transactions follow the same rules as employee-initiated ones.",
149
- "domestic": "Domestic transactions that are within limits are auto-approved. High-value ones outside business hours are held.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  },
151
  max_steps=7,
152
  scenario_count=80,
 
54
  "work_end": 18,
55
  },
56
  clarification_map={
57
+ # ── Level 1: General (single short keyword → partial/ambiguous) ──
58
+ "hours": (
59
+ "Working hours are from 9 AM to 6 PM."
60
+ ),
61
+ "sensitive": (
62
+ "Sensitive data includes personal records, financial data, "
63
+ "and proprietary information."
64
+ ),
65
+ "internal": (
66
+ "Internal data follows the same access rules as sensitive data."
67
+ ),
68
+ "public": (
69
+ "Public data has no access restrictions and can be accessed "
70
+ "at any time."
71
+ ),
72
+ "access": (
73
+ "Access depends on the data type and the current hour. "
74
+ "Public data is unrestricted. Other types have time-based rules."
75
+ ),
76
+
77
+ # ── Level 2: Medium specificity (common phrases → more detail) ──
78
+ "working hours": (
79
+ "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format). "
80
+ "Sensitive and internal data can only be accessed during this window."
81
+ ),
82
+ "work hours": (
83
+ "Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format)."
84
+ ),
85
+ "after hours": (
86
+ "After hours means outside the working hours window. "
87
+ "This includes early morning hours and evening hours from 6 PM onward."
88
+ ),
89
+
90
+ # ── Level 3: Precise (compound/specific → ground truth aligned) ──
91
+ "working hours boundary": (
92
+ "Working hours use a half-open interval: hour >= 9 AND hour < 18. "
93
+ "Hour 9 is the first working hour. Hour 17 is the last working hour. "
94
+ "Hour 18 (6:00 PM) is NOT within working hours — it is the start of "
95
+ "after-hours."
96
+ ),
97
+ "exactly 18": (
98
+ "Hour 18 (6:00 PM) is considered after hours. The working hours "
99
+ "window ends BEFORE 18, so 18 is outside. Access to sensitive "
100
+ "and internal data is denied at hour 18."
101
+ ),
102
+ "time boundary": (
103
+ "The time boundaries are strict: working hours are "
104
+ "hour >= 9 AND hour < 18. Hour 9 is inside working hours, "
105
+ "hour 18 is outside. The last valid working hour is 17."
106
+ ),
107
+ "sensitive time": (
108
+ "Sensitive data can only be accessed when the hour is >= 9 AND "
109
+ "strictly less than 18. At hour 18, access is denied."
110
+ ),
111
+ "internal time": (
112
+ "Internal data follows the exact same time rules as sensitive "
113
+ "data: allowed when hour >= 9 AND hour < 18."
114
+ ),
115
+ "deny allow": (
116
+ "The decision is ALLOW for public data at any time, or for "
117
+ "sensitive/internal data during hours 9 through 17 (inclusive). "
118
+ "The decision is DENY for sensitive/internal data at hours 0-8 "
119
+ "and 18-23."
120
+ ),
121
  },
122
  max_steps=5,
123
  scenario_count=30,
 
147
  "business_end": 17,
148
  },
149
  clarification_map={
150
+ # ── Level 1: General (single keyword → partial/ambiguous truths) ──
151
+ # NOTE: "junior" answer is technically true but intentionally
152
+ # incomplete — it only mentions the "outside business hours"
153
+ # restriction, which can mislead the agent into thinking
154
+ # confidential IS accessible during business hours.
155
+ "junior": (
156
+ "Junior employees are entry-level staff. They can access public "
157
+ "and internal documents during business hours, but not confidential "
158
+ "documents outside business hours."
159
+ ),
160
+ "senior": (
161
+ "Senior employees have unrestricted access to all documents "
162
+ "at all times."
163
+ ),
164
+ "contractor": (
165
+ "Contractors can only access public documents. They cannot access "
166
+ "internal or confidential documents at any time."
167
+ ),
168
+ "confidential": (
169
+ "Confidential documents include board minutes, salary data, and "
170
+ "strategic plans. Access is highly restricted."
171
+ ),
172
+ "internal": (
173
+ "Internal documents include team wikis, project plans, and "
174
+ "internal communications."
175
+ ),
176
+ "public": (
177
+ "Public documents include published reports, press releases, and "
178
+ "public-facing content. No access restrictions."
179
+ ),
180
+ "hours": (
181
+ "Business hours are 8 AM to 5 PM."
182
+ ),
183
+
184
+ # ── Level 2: Medium specificity (common phrases → more detail) ──
185
+ "business hours": (
186
+ "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format). "
187
+ "Access permissions change based on whether the current hour falls "
188
+ "within this range."
189
+ ),
190
+ "work hours": (
191
+ "Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format)."
192
+ ),
193
+ "outside business": (
194
+ "Outside business hours includes early morning and evening. "
195
+ "Restrictions are tighter outside this window for junior staff."
196
+ ),
197
+
198
+ # ── Level 3: Precise (compound keywords → ground truth aligned) ──
199
+ # These answers reveal the FULL truth, correcting any misleading
200
+ # impressions from Level 1 answers.
201
+ "junior confidential": (
202
+ "Junior employees CANNOT access confidential documents at ANY time, "
203
+ "regardless of whether it is during or outside business hours. "
204
+ "The policy statement about 'outside business hours' is a minimum "
205
+ "restriction — the actual rule is a blanket denial of confidential "
206
+ "access for juniors."
207
+ ),
208
+ "junior internal": (
209
+ "Junior employees can access internal documents ONLY during business "
210
+ "hours (hour >= 8 AND hour < 17). Outside business hours, internal "
211
+ "documents are denied for juniors."
212
+ ),
213
+ "junior public": (
214
+ "Junior employees can access public documents at any time. "
215
+ "Public access has no restrictions for any role."
216
+ ),
217
+ "business hours boundary": (
218
+ "Business hours use a half-open interval: hour >= 8 AND hour < 17. "
219
+ "Hour 8 is the first business hour. Hour 16 is the last business hour. "
220
+ "Hour 17 (5:00 PM) is NOT within business hours."
221
+ ),
222
+ "exactly 17": (
223
+ "Hour 17 (5:00 PM) is considered outside business hours. "
224
+ "The business hours window ends BEFORE 17. Junior employees lose "
225
+ "access to internal documents at hour 17."
226
+ ),
227
+ "time boundary": (
228
+ "Business hours are hour >= 8 AND hour < 17. "
229
+ "Hour 8 is inside, hour 17 is outside. "
230
+ "The last valid business hour is 16."
231
+ ),
232
+ "confidential during": (
233
+ "Confidential documents are NOT accessible to junior employees "
234
+ "during business hours. The policy only explicitly mentions the "
235
+ "'outside business hours' restriction, but the actual rule denies "
236
+ "junior access to confidential at all times."
237
+ ),
238
+ "contractor internal": (
239
+ "Contractors cannot access internal documents. They are restricted "
240
+ "to public documents only, regardless of time."
241
+ ),
242
  },
243
  max_steps=7,
244
  scenario_count=50,
 
273
  "business_end": 17,
274
  },
275
  clarification_map={
276
+ # ── Level 1: General (single keyword → partial/vague) ──
277
+ "limit": (
278
+ "The standard transaction limit is $5,000."
279
+ ),
280
+ "international": (
281
+ "All international transfers require compliance review, "
282
+ "regardless of amount or time."
283
+ ),
284
+ "manager": (
285
+ "Manager-initiated transactions are exempt from the standard "
286
+ "limit requirement. They are auto-approved for domestic "
287
+ "transactions unless other rules apply."
288
+ ),
289
+ "domestic": (
290
+ "Domestic transactions follow different rules based on amount "
291
+ "and time of day."
292
+ ),
293
+ "compliance": (
294
+ "Compliance review is required for all international transfers. "
295
+ "It is a separate process from manager approval."
296
+ ),
297
+ "routine": (
298
+ "Routine transactions are domestic transactions within the "
299
+ "standard limit."
300
+ ),
301
+ "system": (
302
+ "System-initiated transactions follow the same rules as "
303
+ "employee-initiated ones."
304
+ ),
305
+ "hours": (
306
+ "Business hours are 9 AM to 5 PM."
307
+ ),
308
+ "exempt": (
309
+ "Manager-initiated transactions are exempt from the standard "
310
+ "limit. However, other rules may still apply."
311
+ ),
312
+
313
+ # ── Level 2: Medium specificity ──
314
+ "standard limit": (
315
+ "The standard transaction limit is $5,000. Transactions above "
316
+ "this amount require manager approval unless the initiator is "
317
+ "a manager."
318
+ ),
319
+ "high-value": (
320
+ "High-value domestic transactions are those with an amount of "
321
+ "$10,000 or more."
322
+ ),
323
+ "high value": (
324
+ "High-value domestic transactions are those with an amount of "
325
+ "$10,000 or more."
326
+ ),
327
+ "threshold": (
328
+ "The standard limit is $5,000. The high-value threshold for "
329
+ "domestic transactions is $10,000."
330
+ ),
331
+ "business hours": (
332
+ "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
333
+ ),
334
+ "work hours": (
335
+ "Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
336
+ ),
337
+ "non-business": (
338
+ "Non-business hours means outside the 9 AM to 5 PM window, "
339
+ "including the 5 PM hour itself."
340
+ ),
341
+
342
+ # ── Level 3: Precise (compound keywords → ground truth) ──
343
+ "exactly 5000": (
344
+ "A transaction of exactly $5,000 is WITHIN the standard limit and "
345
+ "is auto-approved for domestic transactions. Only amounts STRICTLY "
346
+ "above $5,000 (i.e., $5,001+) trigger the approval requirement. "
347
+ "The comparison is amount > 5000, not amount >= 5000."
348
+ ),
349
+ "exactly 10000": (
350
+ "A domestic transaction of exactly $10,000 IS considered high-value. "
351
+ "The threshold is amount >= 10000. A $9,999 transaction is NOT "
352
+ "high-value."
353
+ ),
354
+ "exactly 17": (
355
+ "Hour 17 (5:00 PM) is considered non-business hours. The business "
356
+ "hours window ends BEFORE 17. A high-value domestic transaction at "
357
+ "hour 17 would be held for review."
358
+ ),
359
+ "business hours boundary": (
360
+ "Business hours use a half-open interval: hour >= 9 AND hour < 17. "
361
+ "Hour 9 is business hours. Hour 16 is the last business hour. "
362
+ "Hour 17 (5:00 PM) is NOT business hours."
363
+ ),
364
+ "time boundary": (
365
+ "Business hours are hour >= 9 AND hour < 17. "
366
+ "Hour 9 is inside, hour 17 is outside."
367
+ ),
368
+ "manager exempt": (
369
+ "Manager-initiated transactions are exempt from the standard $5,000 "
370
+ "limit only. They are NOT exempt from international compliance review "
371
+ "or the high-value domestic HOLD rule. A manager's $10,000 domestic "
372
+ "transaction outside business hours is still HELD."
373
+ ),
374
+ "manager high-value": (
375
+ "Manager exemption only applies to the standard limit ($5,000). "
376
+ "Managers are still subject to: (1) COMPLIANCE_REVIEW for international "
377
+ "transfers, and (2) HOLD for high-value domestic transactions (>= $10,000) "
378
+ "outside business hours. The exemption is narrow."
379
+ ),
380
+ "manager international": (
381
+ "Even manager-initiated international transfers require COMPLIANCE_REVIEW. "
382
+ "The manager exemption does not override the international transfer rule."
383
+ ),
384
+ "domestic hold": (
385
+ "A domestic transaction is HELD when: (1) amount >= $10,000 AND "
386
+ "(2) the hour is outside business hours (hour < 9 or hour >= 17). "
387
+ "Both conditions must be true. During business hours, high-value "
388
+ "domestic transactions get REQUIRE_APPROVAL instead (if not manager-initiated)."
389
+ ),
390
+ "rule priority": (
391
+ "Rules are evaluated in priority order: "
392
+ "(1) International transfers → COMPLIANCE_REVIEW always. "
393
+ "(2) High-value domestic (>= $10,000) outside business hours → HOLD. "
394
+ "(3) Above standard limit (> $5,000) and not manager → REQUIRE_APPROVAL. "
395
+ "(4) Everything else → APPROVE."
396
+ ),
397
+ "international manager": (
398
+ "International transfers ALWAYS go to COMPLIANCE_REVIEW, even for "
399
+ "managers. This is the highest-priority rule."
400
+ ),
401
  },
402
  max_steps=7,
403
  scenario_count=80,
test_hf_spaces.py CHANGED
@@ -4,6 +4,7 @@ HF Spaces Test Runner - Policy-to-Logic Environment
4
  Tests all endpoints on the deployed HF Spaces and generates a report.
5
 
6
  Run it:
 
7
  uv run python test_hf_spaces.py
8
  """
9
 
@@ -12,12 +13,16 @@ import json
12
  import time
13
  from typing import Dict, Any, List
14
  from datetime import datetime
15
- from urllib.parse import urljoin
16
 
17
- # HF Spaces URL
18
- HF_SPACE_URL = "https://huggingface.co/spaces/Godreign/Policy2Logic"
19
- # Extract the actual API endpoint
20
- BASE_URL = HF_SPACE_URL
 
 
 
 
 
21
 
22
  class HFSpacesTestRunner:
23
  def __init__(self, base_url: str):
@@ -26,7 +31,7 @@ class HFSpacesTestRunner:
26
  self.passed = 0
27
  self.failed = 0
28
  self.start_time = datetime.now()
29
-
30
  def log(self, message: str, level: str = "INFO"):
31
  """Print formatted log message."""
32
  timestamp = datetime.now().strftime("%H:%M:%S")
@@ -38,22 +43,81 @@ class HFSpacesTestRunner:
38
  "TEST": "πŸ§ͺ"
39
  }
40
  print(f"[{timestamp}] {prefix.get(level, '')} {message}")
41
-
42
- def test_endpoint(self, method: str, endpoint: str, data: dict = None,
43
- description: str = "", timeout: int = 10) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  """Test an HF Spaces endpoint and record result."""
45
- url = urljoin(self.base_url, endpoint)
46
  display_name = description or endpoint
47
  self.log(f"Testing: {display_name}", "TEST")
48
-
49
  try:
50
  if method == "POST":
51
  response = requests.post(url, json=data, timeout=timeout)
52
  else:
53
  response = requests.get(url, timeout=timeout)
54
-
55
  success = response.status_code in [200, 201]
56
-
57
  result = {
58
  "test": display_name,
59
  "endpoint": endpoint,
@@ -64,14 +128,14 @@ class HFSpacesTestRunner:
64
  "response": None,
65
  "error": None
66
  }
67
-
68
  try:
69
  result["response"] = response.json()
70
- except:
71
  result["response"] = response.text[:300]
72
-
73
  self.results.append(result)
74
-
75
  if success:
76
  self.passed += 1
77
  self.log(f" Status: {response.status_code} - PASSED", "SUCCESS")
@@ -79,9 +143,9 @@ class HFSpacesTestRunner:
79
  self.failed += 1
80
  self.log(f" Status: {response.status_code} - FAILED", "ERROR")
81
  result["error"] = response.text[:200]
82
-
83
  return success
84
-
85
  except requests.exceptions.Timeout:
86
  self.failed += 1
87
  self.log(f" TIMEOUT (>{timeout}s)", "ERROR")
@@ -124,63 +188,107 @@ class HFSpacesTestRunner:
124
  "error": str(e)
125
  })
126
  return False
127
-
 
128
  def run_tests(self) -> bool:
129
  """Run all endpoint tests."""
130
- self.log("Starting HF Spaces test suite...", "INFO")
131
  print("\n" + "="*70)
132
-
133
  # Test 1: Root endpoint
134
  self.test_endpoint("GET", "/", description="Root Endpoint")
135
-
136
  # Test 2: Health check
137
  self.test_endpoint("GET", "/health", description="Health Check")
138
-
139
  # Test 3: Root with query params (HF Spaces probe)
140
  self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
141
-
142
  # Test 4: List tasks
143
  self.test_endpoint("GET", "/tasks", description="List Available Tasks")
144
-
145
  # Test 5: Reset environment
146
  reset_result = self.test_endpoint(
147
- "POST",
148
- "/reset",
149
  data={"task_name": None},
150
  description="Reset Environment (Start Episode)"
151
  )
152
-
153
  # Test 6: Get state
154
  self.test_endpoint("GET", "/state", description="Get Current State")
155
-
156
- # Test 7: Ask clarification
157
  if reset_result:
 
158
  self.test_endpoint(
159
  "POST",
160
  "/step",
161
  data={
162
  "action_type": "ask_clarification",
163
- "content": "What are the business hours?"
164
  },
165
- description="Step: Ask Clarification"
166
  )
167
-
168
- # Test 8: Final state
169
- self.test_endpoint("GET", "/state", description="Get Final State After Step")
170
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  print("="*70 + "\n")
172
  return self.failed == 0
173
-
 
174
  def generate_report(self):
175
  """Generate and print test report."""
176
  duration = (datetime.now() - self.start_time).total_seconds()
177
  total = self.passed + self.failed
178
  success_rate = 100 * self.passed / total if total > 0 else 0
179
-
180
  print("\n" + "="*70)
181
  print("📊 HF SPACES TEST REPORT")
182
  print("="*70)
183
  print(f"Space URL: {self.base_url}")
 
184
  print(f"Timestamp: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
185
  print(f"Duration: {duration:.2f}s")
186
  print(f"Total Tests: {total}")
@@ -188,7 +296,7 @@ class HFSpacesTestRunner:
188
  print(f"Failed: {self.failed} ❌")
189
  print(f"Success Rate: {success_rate:.1f}%")
190
  print("="*70)
191
-
192
  print("\nπŸ“‹ DETAILED RESULTS:\n")
193
  for i, result in enumerate(self.results, 1):
194
  status_icon = "✅" if result["success"] else "❌"
@@ -197,41 +305,53 @@ class HFSpacesTestRunner:
197
  print(f" URL: {result['url']}")
198
  if result['status_code']:
199
  print(f" Status: {result['status_code']}")
200
-
201
  if result['error']:
202
  print(f" Error: {result['error']}")
203
  elif result['response']:
204
  response_preview = result['response']
205
  if isinstance(response_preview, dict):
206
- response_preview = json.dumps(response_preview, indent=4)[:200]
207
  else:
208
- response_preview = str(response_preview)[:200]
209
  print(f" Response: {response_preview}...")
210
  print()
211
-
212
  print("="*70)
213
  if self.failed == 0:
214
  print("🎉 ALL TESTS PASSED - HF SPACES IS RUNNING!")
215
  else:
216
  print(f"⚠️ {self.failed} test(s) failed. Check details above.")
217
  print("="*70 + "\n")
218
-
219
  return self.failed == 0
220
-
 
221
  def run(self) -> bool:
222
  """Run the entire test suite."""
223
  print("\n🚀 Policy-to-Logic RL Environment - HF Spaces Test Suite\n")
224
-
225
- self.log(f"Target: {self.base_url}", "INFO")
226
-
227
- # Run tests
 
 
 
 
 
 
 
 
 
 
228
  all_passed = self.run_tests()
229
-
230
- # Generate report
231
  self.generate_report()
232
-
233
  return all_passed
234
 
 
235
  def main():
236
  runner = HFSpacesTestRunner(BASE_URL)
237
  try:
@@ -242,6 +362,8 @@ def main():
242
  exit(1)
243
  except Exception as e:
244
  print(f"\n❌ Unexpected error: {e}")
 
 
245
  exit(1)
246
 
247
  if __name__ == "__main__":
 
4
  Tests all endpoints on the deployed HF Spaces and generates a report.
5
 
6
  Run it:
7
+ $env:UV_LINK_MODE="copy"
8
  uv run python test_hf_spaces.py
9
  """
10
 
 
13
  import time
14
  from typing import Dict, Any, List
15
  from datetime import datetime
 
16
 
17
# --- URL construction -------------------------------------------------
# Derive the API host from the Space's web page URL:
#   https://huggingface.co/spaces/{user}/{repo} -> https://{user}-{repo}.hf.space
HF_SPACE_WEB_URL = "https://huggingface.co/spaces/Godreign/Policy2Logic"
parts = HF_SPACE_WEB_URL.split("/")
# The last two path segments are the owner and the repository name.
username, repo_name = parts[-2:]  # "Godreign", "Policy2Logic"
# Both segments are lower-cased when building the host name.
BASE_URL = "https://{}-{}.hf.space".format(username.lower(), repo_name.lower())
25
+
26
 
27
  class HFSpacesTestRunner:
28
  def __init__(self, base_url: str):
 
31
  self.passed = 0
32
  self.failed = 0
33
  self.start_time = datetime.now()
34
+
35
  def log(self, message: str, level: str = "INFO"):
36
  """Print formatted log message."""
37
  timestamp = datetime.now().strftime("%H:%M:%S")
 
43
  "TEST": "πŸ§ͺ"
44
  }
45
  print(f"[{timestamp}] {prefix.get(level, '')} {message}")
46
+
47
+ # ── Connectivity Check ────────────────────────────────────────
48
+ def check_connectivity(self) -> bool:
49
+ """
50
+ Verify we can reach the HF Space before running tests.
51
+ Returns True if the space is reachable and responding.
52
+ """
53
+ print(f"\nπŸ”— Connectivity Check")
54
+ print(f" Target URL: {self.base_url}")
55
+ print()
56
+
57
+ try:
58
+ # First, check with allow_redirects=False to detect proxy issues
59
+ resp = requests.get(
60
+ self.base_url,
61
+ timeout=15,
62
+ allow_redirects=False,
63
+ )
64
+ print(f" Direct response: status={resp.status_code}")
65
+
66
+ if resp.is_redirect or resp.is_permanent_redirect:
67
+ redirect_url = resp.headers.get("Location", "unknown")
68
+ print(f" ⚠️ REDIRECT detected β†’ {redirect_url}")
69
+ print(f" The space may not be running or the URL format changed.")
70
+ print(f" Expected API base: {self.base_url}")
71
+ return False
72
+
73
+ # Now check with redirects allowed (normal mode)
74
+ resp = requests.get(self.base_url, timeout=15)
75
+ if resp.status_code == 200:
76
+ try:
77
+ data = resp.json()
78
+ if data.get("status") == "running":
79
+ print(f" βœ… Space is RUNNING")
80
+ print(f" Response: {json.dumps(data, indent=2)[:200]}")
81
+ return True
82
+ else:
83
+ print(f" ⚠️ Unexpected response: {data}")
84
+ return True # Still reachable
85
+ except ValueError:
86
+ print(f" ⚠️ Non-JSON response (may be HF loading page)")
87
+ print(f" Body preview: {resp.text[:200]}")
88
+ return False
89
+ else:
90
+ print(f" ❌ Got status {resp.status_code}")
91
+ print(f" Body: {resp.text[:200]}")
92
+ return False
93
+
94
+ except requests.exceptions.Timeout:
95
+ print(f" ❌ Connection TIMEOUT (>15s)")
96
+ print(f" The space may be sleeping. Visit {HF_SPACE_WEB_URL} to wake it.")
97
+ return False
98
+ except requests.exceptions.ConnectionError as e:
99
+ print(f" ❌ Connection FAILED: {str(e)[:150]}")
100
+ return False
101
+ except Exception as e:
102
+ print(f" ❌ Unexpected error: {e}")
103
+ return False
104
+
105
+ # ── Endpoint Testing ──────────────────────────────────────────
106
+ def test_endpoint(self, method: str, endpoint: str, data: dict = None,
107
+ description: str = "", timeout: int = 15) -> bool:
108
  """Test an HF Spaces endpoint and record result."""
109
+ url = f"{self.base_url}{endpoint}"
110
  display_name = description or endpoint
111
  self.log(f"Testing: {display_name}", "TEST")
112
+
113
  try:
114
  if method == "POST":
115
  response = requests.post(url, json=data, timeout=timeout)
116
  else:
117
  response = requests.get(url, timeout=timeout)
118
+
119
  success = response.status_code in [200, 201]
120
+
121
  result = {
122
  "test": display_name,
123
  "endpoint": endpoint,
 
128
  "response": None,
129
  "error": None
130
  }
131
+
132
  try:
133
  result["response"] = response.json()
134
+ except ValueError:
135
  result["response"] = response.text[:300]
136
+
137
  self.results.append(result)
138
+
139
  if success:
140
  self.passed += 1
141
  self.log(f" Status: {response.status_code} - PASSED", "SUCCESS")
 
143
  self.failed += 1
144
  self.log(f" Status: {response.status_code} - FAILED", "ERROR")
145
  result["error"] = response.text[:200]
146
+
147
  return success
148
+
149
  except requests.exceptions.Timeout:
150
  self.failed += 1
151
  self.log(f" TIMEOUT (>{timeout}s)", "ERROR")
 
188
  "error": str(e)
189
  })
190
  return False
191
+
192
+ # ── Test Suite ────────────────────────────────────────────────
193
  def run_tests(self) -> bool:
194
  """Run all endpoint tests."""
195
+ self.log("Starting endpoint tests...", "INFO")
196
  print("\n" + "="*70)
197
+
198
  # Test 1: Root endpoint
199
  self.test_endpoint("GET", "/", description="Root Endpoint")
200
+
201
  # Test 2: Health check
202
  self.test_endpoint("GET", "/health", description="Health Check")
203
+
204
  # Test 3: Root with query params (HF Spaces probe)
205
  self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
206
+
207
  # Test 4: List tasks
208
  self.test_endpoint("GET", "/tasks", description="List Available Tasks")
209
+
210
  # Test 5: Reset environment
211
  reset_result = self.test_endpoint(
212
+ "POST",
213
+ "/reset",
214
  data={"task_name": None},
215
  description="Reset Environment (Start Episode)"
216
  )
217
+
218
  # Test 6: Get state
219
  self.test_endpoint("GET", "/state", description="Get Current State")
220
+
221
+ # Test 7: Ask clarification (progressive revelation test)
222
  if reset_result:
223
+ # Level 1 - vague question
224
  self.test_endpoint(
225
  "POST",
226
  "/step",
227
  data={
228
  "action_type": "ask_clarification",
229
+ "content": "What are the working hours?"
230
  },
231
+ description="Step: Ask Clarification (Level 1 - vague)"
232
  )
233
+
234
+ # Level 3 - specific compound question
235
+ self.test_endpoint(
236
+ "POST",
237
+ "/step",
238
+ data={
239
+ "action_type": "ask_clarification",
240
+ "content": "What happens at the working hours boundary, exactly at hour 18?"
241
+ },
242
+ description="Step: Ask Clarification (Level 3 - precise)"
243
+ )
244
+
245
+ # Test 8: Propose rules (valid DSL)
246
+ if reset_result:
247
+ self.test_endpoint(
248
+ "POST",
249
+ "/step",
250
+ data={
251
+ "action_type": "propose_rules",
252
+ "content": json.dumps({
253
+ "rules": [
254
+ {
255
+ "if": [
256
+ {"field": "data_type", "op": "==", "value": "public"}
257
+ ],
258
+ "then": "ALLOW"
259
+ },
260
+ {
261
+ "if": [
262
+ {"field": "time", "op": ">=", "value": 9},
263
+ {"field": "time", "op": "<", "value": 18}
264
+ ],
265
+ "then": "ALLOW"
266
+ }
267
+ ],
268
+ "default": "DENY"
269
+ })
270
+ },
271
+ description="Step: Propose Rules (Valid DSL)"
272
+ )
273
+
274
+ # Test 9: Final state
275
+ self.test_endpoint("GET", "/state", description="Get Final State After Steps")
276
+
277
  print("="*70 + "\n")
278
  return self.failed == 0
279
+
280
+ # ── Report Generation ─────────────────────────────────────────
281
  def generate_report(self):
282
  """Generate and print test report."""
283
  duration = (datetime.now() - self.start_time).total_seconds()
284
  total = self.passed + self.failed
285
  success_rate = 100 * self.passed / total if total > 0 else 0
286
+
287
  print("\n" + "="*70)
288
  print("πŸ“Š HF SPACES TEST REPORT")
289
  print("="*70)
290
  print(f"Space URL: {self.base_url}")
291
+ print(f"Web URL: {HF_SPACE_WEB_URL}")
292
  print(f"Timestamp: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
293
  print(f"Duration: {duration:.2f}s")
294
  print(f"Total Tests: {total}")
 
296
  print(f"Failed: {self.failed} ❌")
297
  print(f"Success Rate: {success_rate:.1f}%")
298
  print("="*70)
299
+
300
  print("\nπŸ“‹ DETAILED RESULTS:\n")
301
  for i, result in enumerate(self.results, 1):
302
  status_icon = "βœ…" if result["success"] else "❌"
 
305
  print(f" URL: {result['url']}")
306
  if result['status_code']:
307
  print(f" Status: {result['status_code']}")
308
+
309
  if result['error']:
310
  print(f" Error: {result['error']}")
311
  elif result['response']:
312
  response_preview = result['response']
313
  if isinstance(response_preview, dict):
314
+ response_preview = json.dumps(response_preview, indent=4)[:300]
315
  else:
316
+ response_preview = str(response_preview)[:300]
317
  print(f" Response: {response_preview}...")
318
  print()
319
+
320
  print("="*70)
321
  if self.failed == 0:
322
  print("πŸŽ‰ ALL TESTS PASSED - HF SPACES IS RUNNING!")
323
  else:
324
  print(f"⚠️ {self.failed} test(s) failed. Check details above.")
325
  print("="*70 + "\n")
326
+
327
  return self.failed == 0
328
+
329
+ # ── Main Runner ───────────────────────────────────────────────
330
  def run(self) -> bool:
331
  """Run the entire test suite."""
332
  print("\nπŸš€ Policy-to-Logic RL Environment - HF Spaces Test Suite\n")
333
+
334
+ self.log(f"Target API: {self.base_url}", "INFO")
335
+ self.log(f"Source URL: {HF_SPACE_WEB_URL}", "INFO")
336
+
337
+ # Step 1: Connectivity check
338
+ if not self.check_connectivity():
339
+ print("\n❌ Connectivity check FAILED. Cannot proceed with tests.")
340
+ print(f" Verify the space is running at: {HF_SPACE_WEB_URL}")
341
+ print(f" Expected API endpoint: {self.base_url}")
342
+ return False
343
+
344
+ print()
345
+
346
+ # Step 2: Run endpoint tests
347
  all_passed = self.run_tests()
348
+
349
+ # Step 3: Generate report
350
  self.generate_report()
351
+
352
  return all_passed
353
 
354
+
355
  def main():
356
  runner = HFSpacesTestRunner(BASE_URL)
357
  try:
 
362
  exit(1)
363
  except Exception as e:
364
  print(f"\n❌ Unexpected error: {e}")
365
+ import traceback
366
+ traceback.print_exc()
367
  exit(1)
368
 
369
  if __name__ == "__main__":