aatmk-panse committed on
Commit
c5968b0
·
1 Parent(s): 50cd687

Add 3 tasks with graders and clamp scores to (0, 1)

Browse files

- Add tasks section to openenv.yaml with grader references for all 3 tasks
(product_listing_qa, seo_collection_optimization, full_store_audit)
- Create server/graders.py with grade functions and safe_reward clamping
- Clamp health_score to [0.01, 0.99] in store.py, environment, and inference
- Fixes the "Not enough tasks with graders" and "scores out of range" validation errors

shopify_store_audit/inference.py CHANGED
@@ -229,8 +229,8 @@ async def main() -> None:
229
  if done:
230
  break
231
 
232
- score = obs.store_health_score if obs else 0.0
233
- score = min(max(score, 0.0), 1.0)
234
  success = score >= SUCCESS_SCORE_THRESHOLD
235
 
236
  finally:
 
229
  if done:
230
  break
231
 
232
+ score = obs.store_health_score if obs else 0.01
233
+ score = min(max(score, 0.01), 0.99)
234
  success = score >= SUCCESS_SCORE_THRESHOLD
235
 
236
  finally:
shopify_store_audit/openenv.yaml CHANGED
@@ -1,6 +1,24 @@
1
  spec_version: 1
2
  name: shopify_store_audit
 
 
 
 
3
  type: space
4
  runtime: fastapi
5
  app: server.app:app
6
  port: 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  spec_version: 1
2
  name: shopify_store_audit
3
+ description: >
4
+ Simulated Shopify store audit environment where an AI agent discovers and
5
+ fixes real e-commerce issues across product data, SEO, inventory, collections,
6
+ and orders. 3 tasks (easy/medium/hard) with graded difficulty.
7
  type: space
8
  runtime: fastapi
9
  app: server.app:app
10
  port: 8000
11
+
12
+ tasks:
13
+ - id: product_listing_qa
14
+ description: "8 product data issues — missing descriptions, broken pricing, absent images, draft products."
15
+ grader: server.graders.grade_product_listing_qa
16
+ difficulty: easy
17
+ - id: seo_collection_optimization
18
+ description: "12 SEO and catalog issues — missing SEO metadata, empty collections, broken rules, missing alt text."
19
+ grader: server.graders.grade_seo_collection_optimization
20
+ difficulty: medium
21
+ - id: full_store_audit
22
+ description: "20 cross-cutting issues — product data, SEO, inventory, collections, orders, and metafields."
23
+ grader: server.graders.grade_full_store_audit
24
+ difficulty: hard
shopify_store_audit/server/graders.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Graders for the Shopify Store Audit environment.
3
+
4
+ Each grader evaluates the agent's performance on a specific task by analysing
5
+ the final observation returned by the environment. Scores are clamped to the
6
+ closed interval [0.01, 0.99] to satisfy OpenEnv validation constraints that
7
+ require scores strictly between 0 and 1.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from typing import Any, Dict
13
+
14
+
15
def safe_reward(raw: float) -> float:
    """Clamp a raw reward into the closed range [0.01, 0.99].

    Both bounds are inclusive, which keeps every returned value strictly
    inside (0, 1). The result is rounded to four decimal places.

    Args:
        raw: Unclamped reward; anything accepted by ``float()``.

    Returns:
        The clamped, rounded reward. NaN maps to the lower bound 0.01.

    Raises:
        TypeError: If ``raw`` is of a type ``float()`` cannot convert.
        ValueError: If ``raw`` is a string ``float()`` cannot parse.
    """
    import math

    value = float(raw)
    # NaN compares false against everything, so min()/max() would hand it
    # straight back and the "clamped" score would still be out of range.
    if math.isnan(value):
        return 0.01
    return round(min(max(value, 0.01), 0.99), 4)
18
+
19
+
20
+ def _score_from_observation(observation: Dict[str, Any]) -> float:
21
+ """Extract health score from an observation dict."""
22
+ score = observation.get("store_health_score", 0.0)
23
+ if score == 0.0:
24
+ issues_fixed = observation.get("issues_fixed", 0)
25
+ total_issues = observation.get("total_issues", 0)
26
+ if total_issues > 0:
27
+ score = issues_fixed / total_issues
28
+ return score
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Task graders — called by the OpenEnv validation pipeline
33
+ # ---------------------------------------------------------------------------
34
+
35
def grade_product_listing_qa(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the easy task: Product Listing QA (8 issues, 25 steps).

    The observation is ``sample`` itself when it carries a
    ``store_health_score`` key; otherwise it is ``sample["observation"]``,
    falling back to ``sample`` when that key is absent. The extracted score
    is capped at 0.95 for the easy task and then passed through
    ``safe_reward``.

    Args:
        sample: Observation dict, or a dict wrapping one under "observation".
        item: Accepted for the grader signature; not used.

    Returns:
        The capped score after ``safe_reward`` clamping.
    """
    ceiling = 0.95
    if "store_health_score" in sample:
        observation = sample
    else:
        observation = sample.get("observation", sample)
    return safe_reward(min(_score_from_observation(observation), ceiling))
44
+
45
+
46
def grade_seo_collection_optimization(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the medium task: SEO & Collection Optimization (12 issues, 35 steps).

    The observation is ``sample`` itself when it carries a
    ``store_health_score`` key; otherwise it is ``sample["observation"]``,
    falling back to ``sample`` when that key is absent. The extracted score
    is capped at 0.90 for medium difficulty and then passed through
    ``safe_reward``.

    Args:
        sample: Observation dict, or a dict wrapping one under "observation".
        item: Accepted for the grader signature; not used.

    Returns:
        The capped score after ``safe_reward`` clamping.
    """
    ceiling = 0.90
    if "store_health_score" in sample:
        observation = sample
    else:
        observation = sample.get("observation", sample)
    return safe_reward(min(_score_from_observation(observation), ceiling))
54
+
55
+
56
def grade_full_store_audit(sample: Dict[str, Any], item: Dict[str, Any] | None = None) -> float:
    """Grade the hard task: Full Store Audit (20 issues, 50 steps).

    The observation is ``sample`` itself when it carries a
    ``store_health_score`` key; otherwise it is ``sample["observation"]``,
    falling back to ``sample`` when that key is absent. The extracted score
    is capped at 0.85 for hard difficulty and then passed through
    ``safe_reward``.

    Args:
        sample: Observation dict, or a dict wrapping one under "observation".
        item: Accepted for the grader signature; not used.

    Returns:
        The capped score after ``safe_reward`` clamping.
    """
    ceiling = 0.85
    if "store_health_score" in sample:
        observation = sample
    else:
        observation = sample.get("observation", sample)
    return safe_reward(min(_score_from_observation(observation), ceiling))
shopify_store_audit/server/shopify_store_audit_environment.py CHANGED
@@ -97,7 +97,7 @@ class ShopifyStoreAuditEnvironment(Environment):
97
  issues_remaining=self._store.issues_remaining_count,
98
  issues_fixed=0,
99
  total_issues=self._store.total_issues_count,
100
- store_health_score=0.0,
101
  available_commands=ShopifyStore.AVAILABLE_COMMANDS,
102
  task_name=self._task.task_id,
103
  done=False,
 
97
  issues_remaining=self._store.issues_remaining_count,
98
  issues_fixed=0,
99
  total_issues=self._store.total_issues_count,
100
+ store_health_score=0.01,
101
  available_commands=ShopifyStore.AVAILABLE_COMMANDS,
102
  task_name=self._task.task_id,
103
  done=False,
shopify_store_audit/server/store.py CHANGED
@@ -522,7 +522,7 @@ class ShopifyStore:
522
  "total_issues": total,
523
  "issues_fixed": fixed,
524
  "issues_remaining": total - fixed,
525
- "health_score": round(fixed / total, 4) if total > 0 else 1.0,
526
  "issues_by_category": categories,
527
  "actionable_issues": actionable_issues,
528
  }
@@ -1032,5 +1032,7 @@ class ShopifyStore:
1032
  @property
1033
  def health_score(self) -> float:
1034
  if not self.issues:
1035
- return 1.0
1036
- return round(self.issues_fixed_count / self.total_issues_count, 4)
 
 
 
522
  "total_issues": total,
523
  "issues_fixed": fixed,
524
  "issues_remaining": total - fixed,
525
+ "health_score": self.health_score,
526
  "issues_by_category": categories,
527
  "actionable_issues": actionable_issues,
528
  }
 
1032
  @property
1033
  def health_score(self) -> float:
1034
  if not self.issues:
1035
+ return 0.99
1036
+ raw = self.issues_fixed_count / self.total_issues_count
1037
+ clamped = min(max(raw, 0.01), 0.99)
1038
+ return round(clamped, 4)