Spaces:
Sleeping
Sleeping
aatmk-panse committed on
Commit ·
c5968b0
1
Parent(s): 50cd687
Add 3 tasks with graders and clamp scores to (0, 1)
Browse files
- Add tasks section to openenv.yaml with grader references for all 3 tasks
(product_listing_qa, seo_collection_optimization, full_store_audit)
- Create server/graders.py with grade functions and safe_reward clamping
- Clamp health_score to [0.01, 0.99] in store.py, environment, and inference
- Fixes "Not enough tasks with graders" and "scores out of range" validation
shopify_store_audit/inference.py
CHANGED
|
@@ -229,8 +229,8 @@ async def main() -> None:
|
|
| 229 |
if done:
|
| 230 |
break
|
| 231 |
|
| 232 |
-
score = obs.store_health_score if obs else 0.
|
| 233 |
-
score = min(max(score, 0.
|
| 234 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 235 |
|
| 236 |
finally:
|
|
|
|
| 229 |
if done:
|
| 230 |
break
|
| 231 |
|
| 232 |
+
score = obs.store_health_score if obs else 0.01
|
| 233 |
+
score = min(max(score, 0.01), 0.99)
|
| 234 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 235 |
|
| 236 |
finally:
|
shopify_store_audit/openenv.yaml
CHANGED
|
@@ -1,6 +1,24 @@
|
|
| 1 |
spec_version: 1
|
| 2 |
name: shopify_store_audit
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: server.app:app
|
| 6 |
port: 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
spec_version: 1
|
| 2 |
name: shopify_store_audit
|
| 3 |
+
description: >
|
| 4 |
+
Simulated Shopify store audit environment where an AI agent discovers and
|
| 5 |
+
fixes real e-commerce issues across product data, SEO, inventory, collections,
|
| 6 |
+
and orders. 3 tasks (easy/medium/hard) with graded difficulty.
|
| 7 |
type: space
|
| 8 |
runtime: fastapi
|
| 9 |
app: server.app:app
|
| 10 |
port: 8000
|
| 11 |
+
|
| 12 |
+
tasks:
|
| 13 |
+
- id: product_listing_qa
|
| 14 |
+
description: "8 product data issues — missing descriptions, broken pricing, absent images, draft products."
|
| 15 |
+
grader: server.graders.grade_product_listing_qa
|
| 16 |
+
difficulty: easy
|
| 17 |
+
- id: seo_collection_optimization
|
| 18 |
+
description: "12 SEO and catalog issues — missing SEO metadata, empty collections, broken rules, missing alt text."
|
| 19 |
+
grader: server.graders.grade_seo_collection_optimization
|
| 20 |
+
difficulty: medium
|
| 21 |
+
- id: full_store_audit
|
| 22 |
+
description: "20 cross-cutting issues — product data, SEO, inventory, collections, orders, and metafields."
|
| 23 |
+
grader: server.graders.grade_full_store_audit
|
| 24 |
+
difficulty: hard
|
shopify_store_audit/server/graders.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Graders for the Shopify Store Audit environment.
|
| 3 |
+
|
| 4 |
+
Each grader evaluates the agent's performance on a specific task by analysing
|
| 5 |
+
the final observation returned by the environment. Scores are clamped to the
|
| 6 |
+
closed interval [0.01, 0.99] to satisfy OpenEnv validation constraints that
|
| 7 |
+
require scores strictly between 0 and 1.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from typing import Any, Dict
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def safe_reward(raw: float) -> float:
    """Clamp a reward to the closed interval [0.01, 0.99].

    The bounds are inclusive, which keeps every returned value strictly
    between 0 and 1. The result is rounded to 4 decimal places.

    Args:
        raw: The unclamped score; anything accepted by ``float()``.

    Returns:
        The clamped, rounded score. NaN maps to the 0.01 floor.
    """
    import math  # local import: keeps this block self-contained

    value = float(raw)
    # min()/max() propagate NaN instead of clamping it, so a NaN score would
    # otherwise escape the valid range; treat it as the worst possible score.
    if math.isnan(value):
        return 0.01
    return round(min(max(value, 0.01), 0.99), 4)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _score_from_observation(observation: Dict[str, Any]) -> float:
|
| 21 |
+
"""Extract health score from an observation dict."""
|
| 22 |
+
score = observation.get("store_health_score", 0.0)
|
| 23 |
+
if score == 0.0:
|
| 24 |
+
issues_fixed = observation.get("issues_fixed", 0)
|
| 25 |
+
total_issues = observation.get("total_issues", 0)
|
| 26 |
+
if total_issues > 0:
|
| 27 |
+
score = issues_fixed / total_issues
|
| 28 |
+
return score
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Task graders — called by the OpenEnv validation pipeline
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
def grade_product_listing_qa(
    sample: Dict[str, Any],
    item: Dict[str, Any] | None = None,
) -> float:
    """Grade the easy task: Product Listing QA (8 issues, 25 steps).

    Scoring is the raw health score of the final observation, capped at
    0.95 and then clamped by ``safe_reward``.

    Args:
        sample: Either the observation dict itself (it then contains
            ``store_health_score``) or a wrapper holding it under
            ``"observation"``.
        item: Unused; accepted for grader-signature compatibility.

    Returns:
        The clamped score for this task.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # A wrapper with a null observation would crash the score lookup;
        # fall back to the sample itself, as when the key is absent.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.95))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def grade_seo_collection_optimization(
    sample: Dict[str, Any],
    item: Dict[str, Any] | None = None,
) -> float:
    """Grade the medium task: SEO & Collection Optimization (12 issues, 35 steps).

    Scoring is the raw health score of the final observation, capped at
    0.90 for medium difficulty and then clamped by ``safe_reward``.

    Args:
        sample: Either the observation dict itself (it then contains
            ``store_health_score``) or a wrapper holding it under
            ``"observation"``.
        item: Unused; accepted for grader-signature compatibility.

    Returns:
        The clamped score for this task.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # A wrapper with a null observation would crash the score lookup;
        # fall back to the sample itself, as when the key is absent.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.90))
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def grade_full_store_audit(
    sample: Dict[str, Any],
    item: Dict[str, Any] | None = None,
) -> float:
    """Grade the hard task: Full Store Audit (20 issues, 50 steps).

    Scoring is the raw health score of the final observation, capped at
    0.85 for hard difficulty and then clamped by ``safe_reward``.

    Args:
        sample: Either the observation dict itself (it then contains
            ``store_health_score``) or a wrapper holding it under
            ``"observation"``.
        item: Unused; accepted for grader-signature compatibility.

    Returns:
        The clamped score for this task.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # A wrapper with a null observation would crash the score lookup;
        # fall back to the sample itself, as when the key is absent.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.85))
|
shopify_store_audit/server/shopify_store_audit_environment.py
CHANGED
|
@@ -97,7 +97,7 @@ class ShopifyStoreAuditEnvironment(Environment):
|
|
| 97 |
issues_remaining=self._store.issues_remaining_count,
|
| 98 |
issues_fixed=0,
|
| 99 |
total_issues=self._store.total_issues_count,
|
| 100 |
-
store_health_score=0.
|
| 101 |
available_commands=ShopifyStore.AVAILABLE_COMMANDS,
|
| 102 |
task_name=self._task.task_id,
|
| 103 |
done=False,
|
|
|
|
| 97 |
issues_remaining=self._store.issues_remaining_count,
|
| 98 |
issues_fixed=0,
|
| 99 |
total_issues=self._store.total_issues_count,
|
| 100 |
+
store_health_score=0.01,
|
| 101 |
available_commands=ShopifyStore.AVAILABLE_COMMANDS,
|
| 102 |
task_name=self._task.task_id,
|
| 103 |
done=False,
|
shopify_store_audit/server/store.py
CHANGED
|
@@ -522,7 +522,7 @@ class ShopifyStore:
|
|
| 522 |
"total_issues": total,
|
| 523 |
"issues_fixed": fixed,
|
| 524 |
"issues_remaining": total - fixed,
|
| 525 |
-
"health_score":
|
| 526 |
"issues_by_category": categories,
|
| 527 |
"actionable_issues": actionable_issues,
|
| 528 |
}
|
|
@@ -1032,5 +1032,7 @@ class ShopifyStore:
|
|
| 1032 |
@property
|
| 1033 |
def health_score(self) -> float:
|
| 1034 |
if not self.issues:
|
| 1035 |
-
return
|
| 1036 |
-
|
|
|
|
|
|
|
|
|
| 522 |
"total_issues": total,
|
| 523 |
"issues_fixed": fixed,
|
| 524 |
"issues_remaining": total - fixed,
|
| 525 |
+
"health_score": self.health_score,
|
| 526 |
"issues_by_category": categories,
|
| 527 |
"actionable_issues": actionable_issues,
|
| 528 |
}
|
|
|
|
| 1032 |
@property
|
| 1033 |
def health_score(self) -> float:
|
| 1034 |
if not self.issues:
|
| 1035 |
+
return 0.99
|
| 1036 |
+
raw = self.issues_fixed_count / self.total_issues_count
|
| 1037 |
+
clamped = min(max(raw, 0.01), 0.99)
|
| 1038 |
+
return round(clamped, 4)
|