Spaces:

devaatmik
/

shopify-store-audit

Sleeping

aatmk-panse committed on Apr 11

Commit

c5968b0

1 Parent(s): 50cd687

Add 3 tasks with graders and clamp scores to (0, 1)

- Add tasks section to openenv.yaml with grader references for all 3 tasks
(product_listing_qa, seo_collection_optimization, full_store_audit)
- Create server/graders.py with grade functions and safe_reward clamping
- Clamp health_score to [0.01, 0.99] in store.py, environment, and inference
- Fixes "Not enough tasks with graders" and "scores out of range" validation

Files changed (5) hide show

shopify_store_audit/inference.py +2 -2
shopify_store_audit/openenv.yaml +18 -0
shopify_store_audit/server/graders.py +63 -0
shopify_store_audit/server/shopify_store_audit_environment.py +1 -1
shopify_store_audit/server/store.py +5 -3

shopify_store_audit/inference.py CHANGED Viewed

@@ -229,8 +229,8 @@ async def main() -> None:
             if done:
                 break
-        score = obs.store_health_score if obs else 0.0
-        score = min(max(score, 0.0), 1.0)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:

             if done:
                 break
+        score = obs.store_health_score if obs else 0.01
+        score = min(max(score, 0.01), 0.99)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:

shopify_store_audit/openenv.yaml CHANGED Viewed

@@ -1,6 +1,24 @@
 spec_version: 1
 name: shopify_store_audit
 type: space
 runtime: fastapi
 app: server.app:app
 port: 8000

 spec_version: 1
 name: shopify_store_audit
+description: >
+  Simulated Shopify store audit environment where an AI agent discovers and
+  fixes real e-commerce issues across product data, SEO, inventory, collections,
+  and orders. 3 tasks (easy/medium/hard) with graded difficulty.
 type: space
 runtime: fastapi
 app: server.app:app
 port: 8000
+tasks:
+  - id: product_listing_qa
+    description: "8 product data issues — missing descriptions, broken pricing, absent images, draft products."
+    grader: server.graders.grade_product_listing_qa
+    difficulty: easy
+  - id: seo_collection_optimization
+    description: "12 SEO and catalog issues — missing SEO metadata, empty collections, broken rules, missing alt text."
+    grader: server.graders.grade_seo_collection_optimization
+    difficulty: medium
+  - id: full_store_audit
+    description: "20 cross-cutting issues — product data, SEO, inventory, collections, orders, and metafields."
+    grader: server.graders.grade_full_store_audit
+    difficulty: hard

shopify_store_audit/server/graders.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""
+Graders for the Shopify Store Audit environment.
+Each grader evaluates the agent's performance on a specific task by analysing
+the final observation returned by the environment. Scores are clamped to the
+closed interval [0.01, 0.99] to satisfy OpenEnv validation constraints that
+require scores strictly between 0 and 1.
+"""
+from __future__ import annotations
+from typing import Any, Dict
def safe_reward(raw: float) -> float:
    """Clamp a raw reward into the closed range [0.01, 0.99].

    Both bounds are inclusive, which keeps every result strictly between
    0 and 1. The clamped value is rounded to four decimal places.

    Args:
        raw: Unclamped score; anything ``float()`` accepts.

    Returns:
        A float in [0.01, 0.99]. NaN maps to the lower bound, 0.01.

    Raises:
        TypeError, ValueError: If ``raw`` cannot be converted by ``float()``.
    """
    import math

    value = float(raw)
    # NaN compares false against everything, so min()/max() would hand it
    # back unchanged and the result would fall outside the promised range.
    if math.isnan(value):
        return 0.01
    return round(min(max(value, 0.01), 0.99), 4)
+def _score_from_observation(observation: Dict[str, Any]) -> float:
+    """Extract health score from an observation dict."""
+    score = observation.get("store_health_score", 0.0)
+    if score == 0.0:
+        issues_fixed = observation.get("issues_fixed", 0)
+        total_issues = observation.get("total_issues", 0)
+        if total_issues > 0:
+            score = issues_fixed / total_issues
+    return score
+# ---------------------------------------------------------------------------
+# Task graders — called by the OpenEnv validation pipeline
+# ---------------------------------------------------------------------------
def grade_product_listing_qa(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the easy task: Product Listing QA (8 issues, 25 steps).

    The score extracted by ``_score_from_observation`` is capped at 0.95
    and then passed through ``safe_reward``.

    Args:
        sample: Either the observation dict itself (recognised by a
            ``store_health_score`` key) or a wrapper that holds it under
            ``"observation"``.
        item: Not used by this grader.

    Returns:
        The clamped score produced by ``safe_reward``.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # An explicit null "observation" was previously forwarded as None
        # and crashed the score lookup; keep using the sample in that case.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.95))
def grade_seo_collection_optimization(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the medium task: SEO & Collection Optimization (12 issues, 35 steps).

    The score extracted by ``_score_from_observation`` is capped at 0.90
    and then passed through ``safe_reward``.

    Args:
        sample: Either the observation dict itself (recognised by a
            ``store_health_score`` key) or a wrapper that holds it under
            ``"observation"``.
        item: Not used by this grader.

    Returns:
        The clamped score produced by ``safe_reward``.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # An explicit null "observation" was previously forwarded as None
        # and crashed the score lookup; keep using the sample in that case.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.90))
def grade_full_store_audit(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the hard task: Full Store Audit (20 issues, 50 steps).

    The score extracted by ``_score_from_observation`` is capped at 0.85
    and then passed through ``safe_reward``.

    Args:
        sample: Either the observation dict itself (recognised by a
            ``store_health_score`` key) or a wrapper that holds it under
            ``"observation"``.
        item: Not used by this grader.

    Returns:
        The clamped score produced by ``safe_reward``.
    """
    obs = sample
    if "store_health_score" not in sample:
        nested = sample.get("observation")
        # An explicit null "observation" was previously forwarded as None
        # and crashed the score lookup; keep using the sample in that case.
        if nested is not None:
            obs = nested
    raw_score = _score_from_observation(obs)
    return safe_reward(min(raw_score, 0.85))

shopify_store_audit/server/shopify_store_audit_environment.py CHANGED Viewed

@@ -97,7 +97,7 @@ class ShopifyStoreAuditEnvironment(Environment):
             issues_remaining=self._store.issues_remaining_count,
             issues_fixed=0,
             total_issues=self._store.total_issues_count,
-            store_health_score=0.0,
             available_commands=ShopifyStore.AVAILABLE_COMMANDS,
             task_name=self._task.task_id,
             done=False,

             issues_remaining=self._store.issues_remaining_count,
             issues_fixed=0,
             total_issues=self._store.total_issues_count,
+            store_health_score=0.01,
             available_commands=ShopifyStore.AVAILABLE_COMMANDS,
             task_name=self._task.task_id,
             done=False,

shopify_store_audit/server/store.py CHANGED Viewed

@@ -522,7 +522,7 @@ class ShopifyStore:
             "total_issues": total,
             "issues_fixed": fixed,
             "issues_remaining": total - fixed,
-            "health_score": round(fixed / total, 4) if total > 0 else 1.0,
             "issues_by_category": categories,
             "actionable_issues": actionable_issues,
         }
@@ -1032,5 +1032,7 @@ class ShopifyStore:
     @property
     def health_score(self) -> float:
         if not self.issues:
-            return 1.0
-        return round(self.issues_fixed_count / self.total_issues_count, 4)

             "total_issues": total,
             "issues_fixed": fixed,
             "issues_remaining": total - fixed,
+            "health_score": self.health_score,
             "issues_by_category": categories,
             "actionable_issues": actionable_issues,
         }
     @property
     def health_score(self) -> float:
         if not self.issues:
+            return 0.99
+        raw = self.issues_fixed_count / self.total_issues_count
+        clamped = min(max(raw, 0.01), 0.99)
+        return round(clamped, 4)