Spaces:
Running
Running
Godreign-Y committed on
Commit ·
8942e3d
1
Parent(s): 7b82b54
after redesign with claude
Browse files
- policy_to_logic_env/server/ground_truth.py +30 -12
- policy_to_logic_env/server/policies.py +281 -30
- test_hf_spaces.py +175 -53
policy_to_logic_env/server/ground_truth.py
CHANGED
|
@@ -133,11 +133,23 @@ def _ground_truth_transaction_approval(s: dict) -> str:
|
|
| 133 |
|
| 134 |
def answer_clarification(task_name: str, question: str) -> str:
|
| 135 |
"""
|
| 136 |
-
Deterministic clarification oracle.
|
| 137 |
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
Args:
|
| 143 |
task_name: Current task name
|
|
@@ -149,22 +161,28 @@ def answer_clarification(task_name: str, question: str) -> str:
|
|
| 149 |
task = get_task(task_name)
|
| 150 |
question_lower = question.lower().strip()
|
| 151 |
|
| 152 |
-
# Check each keyword in the clarification map
|
| 153 |
best_match = None
|
| 154 |
-
|
| 155 |
|
| 156 |
for keyword, answer in task.clarification_map.items():
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
best_match = answer
|
| 161 |
-
|
| 162 |
|
| 163 |
if best_match:
|
| 164 |
return best_match
|
| 165 |
|
| 166 |
return (
|
| 167 |
"I can provide information about the specific terms and parameters "
|
| 168 |
-
"mentioned in the policy.
|
| 169 |
-
"time
|
|
|
|
| 170 |
)
|
|
|
|
| 133 |
|
| 134 |
def answer_clarification(task_name: str, question: str) -> str:
|
| 135 |
"""
|
| 136 |
+
Deterministic clarification oracle with progressive revelation.
|
| 137 |
|
| 138 |
+
Uses compound keyword matching to provide layered answers:
|
| 139 |
+
- Vague questions (match short keywords) → partial, potentially
|
| 140 |
+
ambiguous truths that may mislead if taken at face value.
|
| 141 |
+
- Specific questions (match long/compound keywords) → precise,
|
| 142 |
+
ground-truth-aligned answers.
|
| 143 |
+
|
| 144 |
+
Compound keywords: if a keyword contains spaces, ALL space-separated
|
| 145 |
+
words must appear anywhere in the question (order-independent).
|
| 146 |
+
More matched keywords = higher priority (more specific answer wins).
|
| 147 |
+
|
| 148 |
+
This design supports RL training where agents must learn to:
|
| 149 |
+
1. Detect ambiguity in initial policy text
|
| 150 |
+
2. Ask targeted questions to resolve ambiguity
|
| 151 |
+
3. Recognize when earlier (vague) answers were misleading
|
| 152 |
+
4. Reconcile contradictory signals by drilling deeper
|
| 153 |
|
| 154 |
Args:
|
| 155 |
task_name: Current task name
|
|
|
|
| 161 |
task = get_task(task_name)
|
| 162 |
question_lower = question.lower().strip()
|
| 163 |
|
|
|
|
| 164 |
best_match = None
|
| 165 |
+
best_match_score = (0, 0) # (num_parts, total_length)
|
| 166 |
|
| 167 |
for keyword, answer in task.clarification_map.items():
|
| 168 |
+
keyword_lower = keyword.lower()
|
| 169 |
+
keyword_parts = keyword_lower.split()
|
| 170 |
+
|
| 171 |
+
# ALL parts of the keyword must appear in the question
|
| 172 |
+
if all(part in question_lower for part in keyword_parts):
|
| 173 |
+
# Score: more keyword parts = more specific = higher priority
|
| 174 |
+
# Tiebreak by total keyword length
|
| 175 |
+
score = (len(keyword_parts), len(keyword_lower))
|
| 176 |
+
if score > best_match_score:
|
| 177 |
best_match = answer
|
| 178 |
+
best_match_score = score
|
| 179 |
|
| 180 |
if best_match:
|
| 181 |
return best_match
|
| 182 |
|
| 183 |
return (
|
| 184 |
"I can provide information about the specific terms and parameters "
|
| 185 |
+
"mentioned in the policy. Try asking about specific aspects like "
|
| 186 |
+
"time boundaries, exact thresholds, role-specific permissions, "
|
| 187 |
+
"or how specific edge cases are handled."
|
| 188 |
)
|
policy_to_logic_env/server/policies.py
CHANGED
|
@@ -54,12 +54,70 @@ DATA_ACCESS = TaskConfig(
|
|
| 54 |
"work_end": 18,
|
| 55 |
},
|
| 56 |
clarification_map={
|
| 57 |
-
|
| 58 |
-
"
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
"
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
},
|
| 64 |
max_steps=5,
|
| 65 |
scenario_count=30,
|
|
@@ -89,15 +147,98 @@ RESOURCE_ACCESS = TaskConfig(
|
|
| 89 |
"business_end": 17,
|
| 90 |
},
|
| 91 |
clarification_map={
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
"
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
},
|
| 102 |
max_steps=7,
|
| 103 |
scenario_count=50,
|
|
@@ -132,21 +273,131 @@ TRANSACTION_APPROVAL = TaskConfig(
|
|
| 132 |
"business_end": 17,
|
| 133 |
},
|
| 134 |
clarification_map={
|
| 135 |
-
|
| 136 |
-
"limit":
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
"
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
"manager":
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
"
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
},
|
| 151 |
max_steps=7,
|
| 152 |
scenario_count=80,
|
|
|
|
| 54 |
"work_end": 18,
|
| 55 |
},
|
| 56 |
clarification_map={
|
| 57 |
+
# ── Level 1: General (single short keyword → partial/ambiguous) ──
|
| 58 |
+
"hours": (
|
| 59 |
+
"Working hours are from 9 AM to 6 PM."
|
| 60 |
+
),
|
| 61 |
+
"sensitive": (
|
| 62 |
+
"Sensitive data includes personal records, financial data, "
|
| 63 |
+
"and proprietary information."
|
| 64 |
+
),
|
| 65 |
+
"internal": (
|
| 66 |
+
"Internal data follows the same access rules as sensitive data."
|
| 67 |
+
),
|
| 68 |
+
"public": (
|
| 69 |
+
"Public data has no access restrictions and can be accessed "
|
| 70 |
+
"at any time."
|
| 71 |
+
),
|
| 72 |
+
"access": (
|
| 73 |
+
"Access depends on the data type and the current hour. "
|
| 74 |
+
"Public data is unrestricted. Other types have time-based rules."
|
| 75 |
+
),
|
| 76 |
+
|
| 77 |
+
# ── Level 2: Medium specificity (common phrases → more detail) ──
|
| 78 |
+
"working hours": (
|
| 79 |
+
"Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format). "
|
| 80 |
+
"Sensitive and internal data can only be accessed during this window."
|
| 81 |
+
),
|
| 82 |
+
"work hours": (
|
| 83 |
+
"Working hours are 9:00 AM to 6:00 PM (9 to 18 in 24-hour format)."
|
| 84 |
+
),
|
| 85 |
+
"after hours": (
|
| 86 |
+
"After hours means outside the working hours window. "
|
| 87 |
+
"This includes early morning hours and evening hours from 6 PM onward."
|
| 88 |
+
),
|
| 89 |
+
|
| 90 |
+
# ── Level 3: Precise (compound/specific → ground truth aligned) ──
|
| 91 |
+
"working hours boundary": (
|
| 92 |
+
"Working hours use a half-open interval: hour >= 9 AND hour < 18. "
|
| 93 |
+
"Hour 9 is the first working hour. Hour 17 is the last working hour. "
|
| 94 |
+
"Hour 18 (6:00 PM) is NOT within working hours — it is the start of "
|
| 95 |
+
"after-hours."
|
| 96 |
+
),
|
| 97 |
+
"exactly 18": (
|
| 98 |
+
"Hour 18 (6:00 PM) is considered after hours. The working hours "
|
| 99 |
+
"window ends BEFORE 18, so 18 is outside. Access to sensitive "
|
| 100 |
+
"and internal data is denied at hour 18."
|
| 101 |
+
),
|
| 102 |
+
"time boundary": (
|
| 103 |
+
"The time boundaries are strict: working hours are "
|
| 104 |
+
"hour >= 9 AND hour < 18. Hour 9 is inside working hours, "
|
| 105 |
+
"hour 18 is outside. The last valid working hour is 17."
|
| 106 |
+
),
|
| 107 |
+
"sensitive time": (
|
| 108 |
+
"Sensitive data can only be accessed when the hour is >= 9 AND "
|
| 109 |
+
"strictly less than 18. At hour 18, access is denied."
|
| 110 |
+
),
|
| 111 |
+
"internal time": (
|
| 112 |
+
"Internal data follows the exact same time rules as sensitive "
|
| 113 |
+
"data: allowed when hour >= 9 AND hour < 18."
|
| 114 |
+
),
|
| 115 |
+
"deny allow": (
|
| 116 |
+
"The decision is ALLOW for public data at any time, or for "
|
| 117 |
+
"sensitive/internal data during hours 9 through 17 (inclusive). "
|
| 118 |
+
"The decision is DENY for sensitive/internal data at hours 0-8 "
|
| 119 |
+
"and 18-23."
|
| 120 |
+
),
|
| 121 |
},
|
| 122 |
max_steps=5,
|
| 123 |
scenario_count=30,
|
|
|
|
| 147 |
"business_end": 17,
|
| 148 |
},
|
| 149 |
clarification_map={
|
| 150 |
+
# ── Level 1: General (single keyword → partial/ambiguous truths) ──
|
| 151 |
+
# NOTE: "junior" answer is technically true but intentionally
|
| 152 |
+
# incomplete β it only mentions the "outside business hours"
|
| 153 |
+
# restriction, which can mislead the agent into thinking
|
| 154 |
+
# confidential IS accessible during business hours.
|
| 155 |
+
"junior": (
|
| 156 |
+
"Junior employees are entry-level staff. They can access public "
|
| 157 |
+
"and internal documents during business hours, but not confidential "
|
| 158 |
+
"documents outside business hours."
|
| 159 |
+
),
|
| 160 |
+
"senior": (
|
| 161 |
+
"Senior employees have unrestricted access to all documents "
|
| 162 |
+
"at all times."
|
| 163 |
+
),
|
| 164 |
+
"contractor": (
|
| 165 |
+
"Contractors can only access public documents. They cannot access "
|
| 166 |
+
"internal or confidential documents at any time."
|
| 167 |
+
),
|
| 168 |
+
"confidential": (
|
| 169 |
+
"Confidential documents include board minutes, salary data, and "
|
| 170 |
+
"strategic plans. Access is highly restricted."
|
| 171 |
+
),
|
| 172 |
+
"internal": (
|
| 173 |
+
"Internal documents include team wikis, project plans, and "
|
| 174 |
+
"internal communications."
|
| 175 |
+
),
|
| 176 |
+
"public": (
|
| 177 |
+
"Public documents include published reports, press releases, and "
|
| 178 |
+
"public-facing content. No access restrictions."
|
| 179 |
+
),
|
| 180 |
+
"hours": (
|
| 181 |
+
"Business hours are 8 AM to 5 PM."
|
| 182 |
+
),
|
| 183 |
+
|
| 184 |
+
# ── Level 2: Medium specificity (common phrases → more detail) ──
|
| 185 |
+
"business hours": (
|
| 186 |
+
"Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format). "
|
| 187 |
+
"Access permissions change based on whether the current hour falls "
|
| 188 |
+
"within this range."
|
| 189 |
+
),
|
| 190 |
+
"work hours": (
|
| 191 |
+
"Business hours are 8:00 AM to 5:00 PM (8 to 17 in 24-hour format)."
|
| 192 |
+
),
|
| 193 |
+
"outside business": (
|
| 194 |
+
"Outside business hours includes early morning and evening. "
|
| 195 |
+
"Restrictions are tighter outside this window for junior staff."
|
| 196 |
+
),
|
| 197 |
+
|
| 198 |
+
# ── Level 3: Precise (compound keywords → ground truth aligned) ──
|
| 199 |
+
# These answers reveal the FULL truth, correcting any misleading
|
| 200 |
+
# impressions from Level 1 answers.
|
| 201 |
+
"junior confidential": (
|
| 202 |
+
"Junior employees CANNOT access confidential documents at ANY time, "
|
| 203 |
+
"regardless of whether it is during or outside business hours. "
|
| 204 |
+
"The policy statement about 'outside business hours' is a minimum "
|
| 205 |
+
"restriction — the actual rule is a blanket denial of confidential "
|
| 206 |
+
"access for juniors."
|
| 207 |
+
),
|
| 208 |
+
"junior internal": (
|
| 209 |
+
"Junior employees can access internal documents ONLY during business "
|
| 210 |
+
"hours (hour >= 8 AND hour < 17). Outside business hours, internal "
|
| 211 |
+
"documents are denied for juniors."
|
| 212 |
+
),
|
| 213 |
+
"junior public": (
|
| 214 |
+
"Junior employees can access public documents at any time. "
|
| 215 |
+
"Public access has no restrictions for any role."
|
| 216 |
+
),
|
| 217 |
+
"business hours boundary": (
|
| 218 |
+
"Business hours use a half-open interval: hour >= 8 AND hour < 17. "
|
| 219 |
+
"Hour 8 is the first business hour. Hour 16 is the last business hour. "
|
| 220 |
+
"Hour 17 (5:00 PM) is NOT within business hours."
|
| 221 |
+
),
|
| 222 |
+
"exactly 17": (
|
| 223 |
+
"Hour 17 (5:00 PM) is considered outside business hours. "
|
| 224 |
+
"The business hours window ends BEFORE 17. Junior employees lose "
|
| 225 |
+
"access to internal documents at hour 17."
|
| 226 |
+
),
|
| 227 |
+
"time boundary": (
|
| 228 |
+
"Business hours are hour >= 8 AND hour < 17. "
|
| 229 |
+
"Hour 8 is inside, hour 17 is outside. "
|
| 230 |
+
"The last valid business hour is 16."
|
| 231 |
+
),
|
| 232 |
+
"confidential during": (
|
| 233 |
+
"Confidential documents are NOT accessible to junior employees "
|
| 234 |
+
"during business hours. The policy only explicitly mentions the "
|
| 235 |
+
"'outside business hours' restriction, but the actual rule denies "
|
| 236 |
+
"junior access to confidential at all times."
|
| 237 |
+
),
|
| 238 |
+
"contractor internal": (
|
| 239 |
+
"Contractors cannot access internal documents. They are restricted "
|
| 240 |
+
"to public documents only, regardless of time."
|
| 241 |
+
),
|
| 242 |
},
|
| 243 |
max_steps=7,
|
| 244 |
scenario_count=50,
|
|
|
|
| 273 |
"business_end": 17,
|
| 274 |
},
|
| 275 |
clarification_map={
|
| 276 |
+
# ── Level 1: General (single keyword → partial/vague) ──
|
| 277 |
+
"limit": (
|
| 278 |
+
"The standard transaction limit is $5,000."
|
| 279 |
+
),
|
| 280 |
+
"international": (
|
| 281 |
+
"All international transfers require compliance review, "
|
| 282 |
+
"regardless of amount or time."
|
| 283 |
+
),
|
| 284 |
+
"manager": (
|
| 285 |
+
"Manager-initiated transactions are exempt from the standard "
|
| 286 |
+
"limit requirement. They are auto-approved for domestic "
|
| 287 |
+
"transactions unless other rules apply."
|
| 288 |
+
),
|
| 289 |
+
"domestic": (
|
| 290 |
+
"Domestic transactions follow different rules based on amount "
|
| 291 |
+
"and time of day."
|
| 292 |
+
),
|
| 293 |
+
"compliance": (
|
| 294 |
+
"Compliance review is required for all international transfers. "
|
| 295 |
+
"It is a separate process from manager approval."
|
| 296 |
+
),
|
| 297 |
+
"routine": (
|
| 298 |
+
"Routine transactions are domestic transactions within the "
|
| 299 |
+
"standard limit."
|
| 300 |
+
),
|
| 301 |
+
"system": (
|
| 302 |
+
"System-initiated transactions follow the same rules as "
|
| 303 |
+
"employee-initiated ones."
|
| 304 |
+
),
|
| 305 |
+
"hours": (
|
| 306 |
+
"Business hours are 9 AM to 5 PM."
|
| 307 |
+
),
|
| 308 |
+
"exempt": (
|
| 309 |
+
"Manager-initiated transactions are exempt from the standard "
|
| 310 |
+
"limit. However, other rules may still apply."
|
| 311 |
+
),
|
| 312 |
+
|
| 313 |
+
# ── Level 2: Medium specificity ──
|
| 314 |
+
"standard limit": (
|
| 315 |
+
"The standard transaction limit is $5,000. Transactions above "
|
| 316 |
+
"this amount require manager approval unless the initiator is "
|
| 317 |
+
"a manager."
|
| 318 |
+
),
|
| 319 |
+
"high-value": (
|
| 320 |
+
"High-value domestic transactions are those with an amount of "
|
| 321 |
+
"$10,000 or more."
|
| 322 |
+
),
|
| 323 |
+
"high value": (
|
| 324 |
+
"High-value domestic transactions are those with an amount of "
|
| 325 |
+
"$10,000 or more."
|
| 326 |
+
),
|
| 327 |
+
"threshold": (
|
| 328 |
+
"The standard limit is $5,000. The high-value threshold for "
|
| 329 |
+
"domestic transactions is $10,000."
|
| 330 |
+
),
|
| 331 |
+
"business hours": (
|
| 332 |
+
"Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
|
| 333 |
+
),
|
| 334 |
+
"work hours": (
|
| 335 |
+
"Business hours are 9:00 AM to 5:00 PM (9 to 17 in 24-hour format)."
|
| 336 |
+
),
|
| 337 |
+
"non-business": (
|
| 338 |
+
"Non-business hours means outside the 9 AM to 5 PM window, "
|
| 339 |
+
"including the 5 PM hour itself."
|
| 340 |
+
),
|
| 341 |
+
|
| 342 |
+
# ── Level 3: Precise (compound keywords → ground truth) ──
|
| 343 |
+
"exactly 5000": (
|
| 344 |
+
"A transaction of exactly $5,000 is WITHIN the standard limit and "
|
| 345 |
+
"is auto-approved for domestic transactions. Only amounts STRICTLY "
|
| 346 |
+
"above $5,000 (i.e., $5,001+) trigger the approval requirement. "
|
| 347 |
+
"The comparison is amount > 5000, not amount >= 5000."
|
| 348 |
+
),
|
| 349 |
+
"exactly 10000": (
|
| 350 |
+
"A domestic transaction of exactly $10,000 IS considered high-value. "
|
| 351 |
+
"The threshold is amount >= 10000. A $9,999 transaction is NOT "
|
| 352 |
+
"high-value."
|
| 353 |
+
),
|
| 354 |
+
"exactly 17": (
|
| 355 |
+
"Hour 17 (5:00 PM) is considered non-business hours. The business "
|
| 356 |
+
"hours window ends BEFORE 17. A high-value domestic transaction at "
|
| 357 |
+
"hour 17 would be held for review."
|
| 358 |
+
),
|
| 359 |
+
"business hours boundary": (
|
| 360 |
+
"Business hours use a half-open interval: hour >= 9 AND hour < 17. "
|
| 361 |
+
"Hour 9 is business hours. Hour 16 is the last business hour. "
|
| 362 |
+
"Hour 17 (5:00 PM) is NOT business hours."
|
| 363 |
+
),
|
| 364 |
+
"time boundary": (
|
| 365 |
+
"Business hours are hour >= 9 AND hour < 17. "
|
| 366 |
+
"Hour 9 is inside, hour 17 is outside."
|
| 367 |
+
),
|
| 368 |
+
"manager exempt": (
|
| 369 |
+
"Manager-initiated transactions are exempt from the standard $5,000 "
|
| 370 |
+
"limit only. They are NOT exempt from international compliance review "
|
| 371 |
+
"or the high-value domestic HOLD rule. A manager's $10,000 domestic "
|
| 372 |
+
"transaction outside business hours is still HELD."
|
| 373 |
+
),
|
| 374 |
+
"manager high-value": (
|
| 375 |
+
"Manager exemption only applies to the standard limit ($5,000). "
|
| 376 |
+
"Managers are still subject to: (1) COMPLIANCE_REVIEW for international "
|
| 377 |
+
"transfers, and (2) HOLD for high-value domestic transactions (>= $10,000) "
|
| 378 |
+
"outside business hours. The exemption is narrow."
|
| 379 |
+
),
|
| 380 |
+
"manager international": (
|
| 381 |
+
"Even manager-initiated international transfers require COMPLIANCE_REVIEW. "
|
| 382 |
+
"The manager exemption does not override the international transfer rule."
|
| 383 |
+
),
|
| 384 |
+
"domestic hold": (
|
| 385 |
+
"A domestic transaction is HELD when: (1) amount >= $10,000 AND "
|
| 386 |
+
"(2) the hour is outside business hours (hour < 9 or hour >= 17). "
|
| 387 |
+
"Both conditions must be true. During business hours, high-value "
|
| 388 |
+
"domestic transactions get REQUIRE_APPROVAL instead (if not manager-initiated)."
|
| 389 |
+
),
|
| 390 |
+
"rule priority": (
|
| 391 |
+
"Rules are evaluated in priority order: "
|
| 392 |
+
"(1) International transfers → COMPLIANCE_REVIEW always. "
|
| 393 |
+
"(2) High-value domestic (>= $10,000) outside business hours → HOLD. "
|
| 394 |
+
"(3) Above standard limit (> $5,000) and not manager → REQUIRE_APPROVAL. "
|
| 395 |
+
"(4) Everything else → APPROVE."
|
| 396 |
+
),
|
| 397 |
+
"international manager": (
|
| 398 |
+
"International transfers ALWAYS go to COMPLIANCE_REVIEW, even for "
|
| 399 |
+
"managers. This is the highest-priority rule."
|
| 400 |
+
),
|
| 401 |
},
|
| 402 |
max_steps=7,
|
| 403 |
scenario_count=80,
|
test_hf_spaces.py
CHANGED
|
@@ -4,6 +4,7 @@ HF Spaces Test Runner - Policy-to-Logic Environment
|
|
| 4 |
Tests all endpoints on the deployed HF Spaces and generates a report.
|
| 5 |
|
| 6 |
Run it:
|
|
|
|
| 7 |
uv run python test_hf_spaces.py
|
| 8 |
"""
|
| 9 |
|
|
@@ -12,12 +13,16 @@ import json
|
|
| 12 |
import time
|
| 13 |
from typing import Dict, Any, List
|
| 14 |
from datetime import datetime
|
| 15 |
-
from urllib.parse import urljoin
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
-
#
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
class HFSpacesTestRunner:
|
| 23 |
def __init__(self, base_url: str):
|
|
@@ -26,7 +31,7 @@ class HFSpacesTestRunner:
|
|
| 26 |
self.passed = 0
|
| 27 |
self.failed = 0
|
| 28 |
self.start_time = datetime.now()
|
| 29 |
-
|
| 30 |
def log(self, message: str, level: str = "INFO"):
|
| 31 |
"""Print formatted log message."""
|
| 32 |
timestamp = datetime.now().strftime("%H:%M:%S")
|
|
@@ -38,22 +43,81 @@ class HFSpacesTestRunner:
|
|
| 38 |
"TEST": "🧪"
|
| 39 |
}
|
| 40 |
print(f"[{timestamp}] {prefix.get(level, '')} {message}")
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""Test an HF Spaces endpoint and record result."""
|
| 45 |
-
url =
|
| 46 |
display_name = description or endpoint
|
| 47 |
self.log(f"Testing: {display_name}", "TEST")
|
| 48 |
-
|
| 49 |
try:
|
| 50 |
if method == "POST":
|
| 51 |
response = requests.post(url, json=data, timeout=timeout)
|
| 52 |
else:
|
| 53 |
response = requests.get(url, timeout=timeout)
|
| 54 |
-
|
| 55 |
success = response.status_code in [200, 201]
|
| 56 |
-
|
| 57 |
result = {
|
| 58 |
"test": display_name,
|
| 59 |
"endpoint": endpoint,
|
|
@@ -64,14 +128,14 @@ class HFSpacesTestRunner:
|
|
| 64 |
"response": None,
|
| 65 |
"error": None
|
| 66 |
}
|
| 67 |
-
|
| 68 |
try:
|
| 69 |
result["response"] = response.json()
|
| 70 |
-
except:
|
| 71 |
result["response"] = response.text[:300]
|
| 72 |
-
|
| 73 |
self.results.append(result)
|
| 74 |
-
|
| 75 |
if success:
|
| 76 |
self.passed += 1
|
| 77 |
self.log(f" Status: {response.status_code} - PASSED", "SUCCESS")
|
|
@@ -79,9 +143,9 @@ class HFSpacesTestRunner:
|
|
| 79 |
self.failed += 1
|
| 80 |
self.log(f" Status: {response.status_code} - FAILED", "ERROR")
|
| 81 |
result["error"] = response.text[:200]
|
| 82 |
-
|
| 83 |
return success
|
| 84 |
-
|
| 85 |
except requests.exceptions.Timeout:
|
| 86 |
self.failed += 1
|
| 87 |
self.log(f" TIMEOUT (>{timeout}s)", "ERROR")
|
|
@@ -124,63 +188,107 @@ class HFSpacesTestRunner:
|
|
| 124 |
"error": str(e)
|
| 125 |
})
|
| 126 |
return False
|
| 127 |
-
|
|
|
|
| 128 |
def run_tests(self) -> bool:
|
| 129 |
"""Run all endpoint tests."""
|
| 130 |
-
self.log("Starting
|
| 131 |
print("\n" + "="*70)
|
| 132 |
-
|
| 133 |
# Test 1: Root endpoint
|
| 134 |
self.test_endpoint("GET", "/", description="Root Endpoint")
|
| 135 |
-
|
| 136 |
# Test 2: Health check
|
| 137 |
self.test_endpoint("GET", "/health", description="Health Check")
|
| 138 |
-
|
| 139 |
# Test 3: Root with query params (HF Spaces probe)
|
| 140 |
self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
|
| 141 |
-
|
| 142 |
# Test 4: List tasks
|
| 143 |
self.test_endpoint("GET", "/tasks", description="List Available Tasks")
|
| 144 |
-
|
| 145 |
# Test 5: Reset environment
|
| 146 |
reset_result = self.test_endpoint(
|
| 147 |
-
"POST",
|
| 148 |
-
"/reset",
|
| 149 |
data={"task_name": None},
|
| 150 |
description="Reset Environment (Start Episode)"
|
| 151 |
)
|
| 152 |
-
|
| 153 |
# Test 6: Get state
|
| 154 |
self.test_endpoint("GET", "/state", description="Get Current State")
|
| 155 |
-
|
| 156 |
-
# Test 7: Ask clarification
|
| 157 |
if reset_result:
|
|
|
|
| 158 |
self.test_endpoint(
|
| 159 |
"POST",
|
| 160 |
"/step",
|
| 161 |
data={
|
| 162 |
"action_type": "ask_clarification",
|
| 163 |
-
"content": "What are the
|
| 164 |
},
|
| 165 |
-
description="Step: Ask Clarification"
|
| 166 |
)
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
print("="*70 + "\n")
|
| 172 |
return self.failed == 0
|
| 173 |
-
|
|
|
|
| 174 |
def generate_report(self):
|
| 175 |
"""Generate and print test report."""
|
| 176 |
duration = (datetime.now() - self.start_time).total_seconds()
|
| 177 |
total = self.passed + self.failed
|
| 178 |
success_rate = 100 * self.passed / total if total > 0 else 0
|
| 179 |
-
|
| 180 |
print("\n" + "="*70)
|
| 181 |
print("π HF SPACES TEST REPORT")
|
| 182 |
print("="*70)
|
| 183 |
print(f"Space URL: {self.base_url}")
|
|
|
|
| 184 |
print(f"Timestamp: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
|
| 185 |
print(f"Duration: {duration:.2f}s")
|
| 186 |
print(f"Total Tests: {total}")
|
|
@@ -188,7 +296,7 @@ class HFSpacesTestRunner:
|
|
| 188 |
print(f"Failed: {self.failed} ❌")
|
| 189 |
print(f"Success Rate: {success_rate:.1f}%")
|
| 190 |
print("="*70)
|
| 191 |
-
|
| 192 |
print("\nπ DETAILED RESULTS:\n")
|
| 193 |
for i, result in enumerate(self.results, 1):
|
| 194 |
status_icon = "✅" if result["success"] else "❌"
|
|
@@ -197,41 +305,53 @@ class HFSpacesTestRunner:
|
|
| 197 |
print(f" URL: {result['url']}")
|
| 198 |
if result['status_code']:
|
| 199 |
print(f" Status: {result['status_code']}")
|
| 200 |
-
|
| 201 |
if result['error']:
|
| 202 |
print(f" Error: {result['error']}")
|
| 203 |
elif result['response']:
|
| 204 |
response_preview = result['response']
|
| 205 |
if isinstance(response_preview, dict):
|
| 206 |
-
response_preview = json.dumps(response_preview, indent=4)[:
|
| 207 |
else:
|
| 208 |
-
response_preview = str(response_preview)[:
|
| 209 |
print(f" Response: {response_preview}...")
|
| 210 |
print()
|
| 211 |
-
|
| 212 |
print("="*70)
|
| 213 |
if self.failed == 0:
|
| 214 |
print("π ALL TESTS PASSED - HF SPACES IS RUNNING!")
|
| 215 |
else:
|
| 216 |
print(f"⚠️ {self.failed} test(s) failed. Check details above.")
|
| 217 |
print("="*70 + "\n")
|
| 218 |
-
|
| 219 |
return self.failed == 0
|
| 220 |
-
|
|
|
|
| 221 |
def run(self) -> bool:
|
| 222 |
"""Run the entire test suite."""
|
| 223 |
print("\nπ Policy-to-Logic RL Environment - HF Spaces Test Suite\n")
|
| 224 |
-
|
| 225 |
-
self.log(f"Target
|
| 226 |
-
|
| 227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
all_passed = self.run_tests()
|
| 229 |
-
|
| 230 |
-
# Generate report
|
| 231 |
self.generate_report()
|
| 232 |
-
|
| 233 |
return all_passed
|
| 234 |
|
|
|
|
| 235 |
def main():
|
| 236 |
runner = HFSpacesTestRunner(BASE_URL)
|
| 237 |
try:
|
|
@@ -242,6 +362,8 @@ def main():
|
|
| 242 |
exit(1)
|
| 243 |
except Exception as e:
|
| 244 |
print(f"\n❌ Unexpected error: {e}")
|
|
|
|
|
|
|
| 245 |
exit(1)
|
| 246 |
|
| 247 |
if __name__ == "__main__":
|
|
|
|
| 4 |
Tests all endpoints on the deployed HF Spaces and generates a report.
|
| 5 |
|
| 6 |
Run it:
|
| 7 |
+
$env:UV_LINK_MODE="copy"
|
| 8 |
uv run python test_hf_spaces.py
|
| 9 |
"""
|
| 10 |
|
|
|
|
| 13 |
import time
|
| 14 |
from typing import Dict, Any, List
|
| 15 |
from datetime import datetime
|
|
|
|
| 16 |
|
| 17 |
+
# ─── URL Construction ─────────────────────────────────────────────
|
| 18 |
+
# HF Spaces URL → API endpoint
|
| 19 |
+
# https://huggingface.co/spaces/{user}/{repo} → https://{user}-{repo}.hf.space
|
| 20 |
+
HF_SPACE_WEB_URL = "https://huggingface.co/spaces/Godreign/Policy2Logic"
|
| 21 |
+
parts = HF_SPACE_WEB_URL.split('/')
|
| 22 |
+
username = parts[-2] # "Godreign"
|
| 23 |
+
repo_name = parts[-1] # "Policy2Logic"
|
| 24 |
+
BASE_URL = f"https://{username.lower()}-{repo_name.lower()}.hf.space"
|
| 25 |
+
|
| 26 |
|
| 27 |
class HFSpacesTestRunner:
|
| 28 |
def __init__(self, base_url: str):
|
|
|
|
| 31 |
self.passed = 0
|
| 32 |
self.failed = 0
|
| 33 |
self.start_time = datetime.now()
|
| 34 |
+
|
| 35 |
def log(self, message: str, level: str = "INFO"):
|
| 36 |
"""Print formatted log message."""
|
| 37 |
timestamp = datetime.now().strftime("%H:%M:%S")
|
|
|
|
| 43 |
"TEST": "🧪"
|
| 44 |
}
|
| 45 |
print(f"[{timestamp}] {prefix.get(level, '')} {message}")
|
| 46 |
+
|
| 47 |
+
# ── Connectivity Check ────────────────────────────────────────
|
| 48 |
+
def check_connectivity(self) -> bool:
|
| 49 |
+
"""
|
| 50 |
+
Verify we can reach the HF Space before running tests.
|
| 51 |
+
Returns True if the space is reachable and responding.
|
| 52 |
+
"""
|
| 53 |
+
print(f"\nπ Connectivity Check")
|
| 54 |
+
print(f" Target URL: {self.base_url}")
|
| 55 |
+
print()
|
| 56 |
+
|
| 57 |
+
try:
|
| 58 |
+
# First, check with allow_redirects=False to detect proxy issues
|
| 59 |
+
resp = requests.get(
|
| 60 |
+
self.base_url,
|
| 61 |
+
timeout=15,
|
| 62 |
+
allow_redirects=False,
|
| 63 |
+
)
|
| 64 |
+
print(f" Direct response: status={resp.status_code}")
|
| 65 |
+
|
| 66 |
+
if resp.is_redirect or resp.is_permanent_redirect:
|
| 67 |
+
redirect_url = resp.headers.get("Location", "unknown")
|
| 68 |
+
print(f" ⚠️ REDIRECT detected → {redirect_url}")
|
| 69 |
+
print(f" The space may not be running or the URL format changed.")
|
| 70 |
+
print(f" Expected API base: {self.base_url}")
|
| 71 |
+
return False
|
| 72 |
+
|
| 73 |
+
# Now check with redirects allowed (normal mode)
|
| 74 |
+
resp = requests.get(self.base_url, timeout=15)
|
| 75 |
+
if resp.status_code == 200:
|
| 76 |
+
try:
|
| 77 |
+
data = resp.json()
|
| 78 |
+
if data.get("status") == "running":
|
| 79 |
+
print(f" ✅ Space is RUNNING")
|
| 80 |
+
print(f" Response: {json.dumps(data, indent=2)[:200]}")
|
| 81 |
+
return True
|
| 82 |
+
else:
|
| 83 |
+
print(f" ⚠️ Unexpected response: {data}")
|
| 84 |
+
return True # Still reachable
|
| 85 |
+
except ValueError:
|
| 86 |
+
print(f" ⚠️ Non-JSON response (may be HF loading page)")
|
| 87 |
+
print(f" Body preview: {resp.text[:200]}")
|
| 88 |
+
return False
|
| 89 |
+
else:
|
| 90 |
+
print(f" ❌ Got status {resp.status_code}")
|
| 91 |
+
print(f" Body: {resp.text[:200]}")
|
| 92 |
+
return False
|
| 93 |
+
|
| 94 |
+
except requests.exceptions.Timeout:
|
| 95 |
+
print(f" ❌ Connection TIMEOUT (>15s)")
|
| 96 |
+
print(f" The space may be sleeping. Visit {HF_SPACE_WEB_URL} to wake it.")
|
| 97 |
+
return False
|
| 98 |
+
except requests.exceptions.ConnectionError as e:
|
| 99 |
+
print(f" ❌ Connection FAILED: {str(e)[:150]}")
|
| 100 |
+
return False
|
| 101 |
+
except Exception as e:
|
| 102 |
+
print(f" ❌ Unexpected error: {e}")
|
| 103 |
+
return False
|
| 104 |
+
|
| 105 |
+
# ── Endpoint Testing ──────────────────────────────────────────
|
| 106 |
+
def test_endpoint(self, method: str, endpoint: str, data: dict = None,
|
| 107 |
+
description: str = "", timeout: int = 15) -> bool:
|
| 108 |
"""Test an HF Spaces endpoint and record result."""
|
| 109 |
+
url = f"{self.base_url}{endpoint}"
|
| 110 |
display_name = description or endpoint
|
| 111 |
self.log(f"Testing: {display_name}", "TEST")
|
| 112 |
+
|
| 113 |
try:
|
| 114 |
if method == "POST":
|
| 115 |
response = requests.post(url, json=data, timeout=timeout)
|
| 116 |
else:
|
| 117 |
response = requests.get(url, timeout=timeout)
|
| 118 |
+
|
| 119 |
success = response.status_code in [200, 201]
|
| 120 |
+
|
| 121 |
result = {
|
| 122 |
"test": display_name,
|
| 123 |
"endpoint": endpoint,
|
|
|
|
| 128 |
"response": None,
|
| 129 |
"error": None
|
| 130 |
}
|
| 131 |
+
|
| 132 |
try:
|
| 133 |
result["response"] = response.json()
|
| 134 |
+
except ValueError:
|
| 135 |
result["response"] = response.text[:300]
|
| 136 |
+
|
| 137 |
self.results.append(result)
|
| 138 |
+
|
| 139 |
if success:
|
| 140 |
self.passed += 1
|
| 141 |
self.log(f" Status: {response.status_code} - PASSED", "SUCCESS")
|
|
|
|
| 143 |
self.failed += 1
|
| 144 |
self.log(f" Status: {response.status_code} - FAILED", "ERROR")
|
| 145 |
result["error"] = response.text[:200]
|
| 146 |
+
|
| 147 |
return success
|
| 148 |
+
|
| 149 |
except requests.exceptions.Timeout:
|
| 150 |
self.failed += 1
|
| 151 |
self.log(f" TIMEOUT (>{timeout}s)", "ERROR")
|
|
|
|
| 188 |
"error": str(e)
|
| 189 |
})
|
| 190 |
return False
|
| 191 |
+
|
| 192 |
+
# ── Test Suite ────────────────────────────────────────────────
|
| 193 |
def run_tests(self) -> bool:
|
| 194 |
"""Run all endpoint tests."""
|
| 195 |
+
self.log("Starting endpoint tests...", "INFO")
|
| 196 |
print("\n" + "="*70)
|
| 197 |
+
|
| 198 |
# Test 1: Root endpoint
|
| 199 |
self.test_endpoint("GET", "/", description="Root Endpoint")
|
| 200 |
+
|
| 201 |
# Test 2: Health check
|
| 202 |
self.test_endpoint("GET", "/health", description="Health Check")
|
| 203 |
+
|
| 204 |
# Test 3: Root with query params (HF Spaces probe)
|
| 205 |
self.test_endpoint("GET", "/?logs=container", description="Root with Logs Query")
|
| 206 |
+
|
| 207 |
# Test 4: List tasks
|
| 208 |
self.test_endpoint("GET", "/tasks", description="List Available Tasks")
|
| 209 |
+
|
| 210 |
# Test 5: Reset environment
|
| 211 |
reset_result = self.test_endpoint(
|
| 212 |
+
"POST",
|
| 213 |
+
"/reset",
|
| 214 |
data={"task_name": None},
|
| 215 |
description="Reset Environment (Start Episode)"
|
| 216 |
)
|
| 217 |
+
|
| 218 |
# Test 6: Get state
|
| 219 |
self.test_endpoint("GET", "/state", description="Get Current State")
|
| 220 |
+
|
| 221 |
+
# Test 7: Ask clarification (progressive revelation test)
|
| 222 |
if reset_result:
|
| 223 |
+
# Level 1 - vague question
|
| 224 |
self.test_endpoint(
|
| 225 |
"POST",
|
| 226 |
"/step",
|
| 227 |
data={
|
| 228 |
"action_type": "ask_clarification",
|
| 229 |
+
"content": "What are the working hours?"
|
| 230 |
},
|
| 231 |
+
description="Step: Ask Clarification (Level 1 - vague)"
|
| 232 |
)
|
| 233 |
+
|
| 234 |
+
# Level 3 - specific compound question
|
| 235 |
+
self.test_endpoint(
|
| 236 |
+
"POST",
|
| 237 |
+
"/step",
|
| 238 |
+
data={
|
| 239 |
+
"action_type": "ask_clarification",
|
| 240 |
+
"content": "What happens at the working hours boundary, exactly at hour 18?"
|
| 241 |
+
},
|
| 242 |
+
description="Step: Ask Clarification (Level 3 - precise)"
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
# Test 8: Propose rules (valid DSL)
|
| 246 |
+
if reset_result:
|
| 247 |
+
self.test_endpoint(
|
| 248 |
+
"POST",
|
| 249 |
+
"/step",
|
| 250 |
+
data={
|
| 251 |
+
"action_type": "propose_rules",
|
| 252 |
+
"content": json.dumps({
|
| 253 |
+
"rules": [
|
| 254 |
+
{
|
| 255 |
+
"if": [
|
| 256 |
+
{"field": "data_type", "op": "==", "value": "public"}
|
| 257 |
+
],
|
| 258 |
+
"then": "ALLOW"
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"if": [
|
| 262 |
+
{"field": "time", "op": ">=", "value": 9},
|
| 263 |
+
{"field": "time", "op": "<", "value": 18}
|
| 264 |
+
],
|
| 265 |
+
"then": "ALLOW"
|
| 266 |
+
}
|
| 267 |
+
],
|
| 268 |
+
"default": "DENY"
|
| 269 |
+
})
|
| 270 |
+
},
|
| 271 |
+
description="Step: Propose Rules (Valid DSL)"
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
# Test 9: Final state
|
| 275 |
+
self.test_endpoint("GET", "/state", description="Get Final State After Steps")
|
| 276 |
+
|
| 277 |
print("="*70 + "\n")
|
| 278 |
return self.failed == 0
|
| 279 |
+
|
| 280 |
+
# ── Report Generation ─────────────────────────────────────────
|
| 281 |
def generate_report(self):
|
| 282 |
"""Generate and print test report."""
|
| 283 |
duration = (datetime.now() - self.start_time).total_seconds()
|
| 284 |
total = self.passed + self.failed
|
| 285 |
success_rate = 100 * self.passed / total if total > 0 else 0
|
| 286 |
+
|
| 287 |
print("\n" + "="*70)
|
| 288 |
print("π HF SPACES TEST REPORT")
|
| 289 |
print("="*70)
|
| 290 |
print(f"Space URL: {self.base_url}")
|
| 291 |
+
print(f"Web URL: {HF_SPACE_WEB_URL}")
|
| 292 |
print(f"Timestamp: {self.start_time.strftime('%Y-%m-%d %H:%M:%S')}")
|
| 293 |
print(f"Duration: {duration:.2f}s")
|
| 294 |
print(f"Total Tests: {total}")
|
|
|
|
| 296 |
print(f"Failed: {self.failed} β")
|
| 297 |
print(f"Success Rate: {success_rate:.1f}%")
|
| 298 |
print("="*70)
|
| 299 |
+
|
| 300 |
print("\nπ DETAILED RESULTS:\n")
|
| 301 |
for i, result in enumerate(self.results, 1):
|
| 302 |
status_icon = "β
" if result["success"] else "β"
|
|
|
|
| 305 |
print(f" URL: {result['url']}")
|
| 306 |
if result['status_code']:
|
| 307 |
print(f" Status: {result['status_code']}")
|
| 308 |
+
|
| 309 |
if result['error']:
|
| 310 |
print(f" Error: {result['error']}")
|
| 311 |
elif result['response']:
|
| 312 |
response_preview = result['response']
|
| 313 |
if isinstance(response_preview, dict):
|
| 314 |
+
response_preview = json.dumps(response_preview, indent=4)[:300]
|
| 315 |
else:
|
| 316 |
+
response_preview = str(response_preview)[:300]
|
| 317 |
print(f" Response: {response_preview}...")
|
| 318 |
print()
|
| 319 |
+
|
| 320 |
print("="*70)
|
| 321 |
if self.failed == 0:
|
| 322 |
print("π ALL TESTS PASSED - HF SPACES IS RUNNING!")
|
| 323 |
else:
|
| 324 |
print(f"β οΈ {self.failed} test(s) failed. Check details above.")
|
| 325 |
print("="*70 + "\n")
|
| 326 |
+
|
| 327 |
return self.failed == 0
|
| 328 |
+
|
| 329 |
+
# ── Main Runner ───────────────────────────────────────────────
|
| 330 |
def run(self) -> bool:
    """Drive the whole suite: connectivity check, endpoint tests, report.

    Returns:
        False as soon as the connectivity check fails (no endpoint tests
        are attempted); otherwise the value returned by ``run_tests``.
    """
    print("\nπ Policy-to-Logic RL Environment - HF Spaces Test Suite\n")

    self.log(f"Target API: {self.base_url}", "INFO")
    self.log(f"Source URL: {HF_SPACE_WEB_URL}", "INFO")

    # Step 1: connectivity check gates everything that follows
    reachable = self.check_connectivity()
    if reachable:
        print()

        # Step 2: run endpoint tests
        outcome = self.run_tests()

        # Step 3: the report is printed whatever the outcome was
        self.generate_report()
        return outcome

    print("\nβ Connectivity check FAILED. Cannot proceed with tests.")
    print(f" Verify the space is running at: {HF_SPACE_WEB_URL}")
    print(f" Expected API endpoint: {self.base_url}")
    return False
|
| 353 |
|
| 354 |
+
|
| 355 |
def main():
|
| 356 |
runner = HFSpacesTestRunner(BASE_URL)
|
| 357 |
try:
|
|
|
|
| 362 |
exit(1)
|
| 363 |
except Exception as e:
|
| 364 |
print(f"\nβ Unexpected error: {e}")
|
| 365 |
+
import traceback
|
| 366 |
+
traceback.print_exc()
|
| 367 |
exit(1)
|
| 368 |
|
| 369 |
if __name__ == "__main__":
|