Spaces:

Mohammed-Altaf
/

DataAnalysis_Env

Sleeping

App Files Community

Mohammed-Altaf committed on Apr 7

Commit

38ee4ab

1 Parent(s): d7b24b0

fixed scoring and logging for final submission

Browse files

Files changed (9) hide show

helpers/logging.py +23 -9
inference.py +8 -7
openenv.yaml +13 -0
tasks/task_easy.py +2 -1
tasks/task_hard.py +1 -1
tasks/task_hard_2.py +1 -1
tasks/task_hard_3.py +1 -1
tasks/task_medium.py +1 -1
tasks/task_medium_2.py +1 -1

helpers/logging.py CHANGED Viewed

@@ -1,19 +1,33 @@
-from typing import List, Optional, Union
 def log_start(task: str, env: str, model: str) -> None:
     print(f"[START] task={task} env={env} model={model}", flush=True)
-def log_step(step: int, action: Union[dict, str], reward: float, done: bool, error: Optional[str]) -> None:
-    error_val = error if error else "null"
     done_val = str(done).lower()
-    print(
-        f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
-        flush=True,
-    )
-def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
-    print(f"[END] success={str(success).lower()} steps={steps} score={int(score)} rewards={rewards_str}\n", flush=True)

+from typing import List, Optional
 def log_start(task: str, env: str, model: str) -> None:
+    """Emit the [START] line at episode begin."""
     print(f"[START] task={task} env={env} model={model}", flush=True)
+def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
+    """Emit one [STEP] line immediately after env.step() returns.
+    Args:
+        step: 1-based step number.
+        action: Compact single-line action label (e.g. 'execute_code').
+        reward: Step reward, formatted to 2 decimal places.
+        done: Whether the episode ended after this step.
+        error: Raw error string from the env, or None.
+    """
+    error_val = error.replace("\n", " ") if error else "null"
     done_val = str(done).lower()
+    print(f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}", flush=True)
+def log_end(success: bool, steps: int, rewards: List[float]) -> None:
+    """Emit the [END] line after env.close(), always emitted even on exception.
+    Args:
+        success: Whether the episode was successful.
+        steps: Total number of steps taken.
+        rewards: List of per-step rewards, each formatted to 2 decimal places.
+    """
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)
+    print(f"[END] success={str(success).lower()} steps={steps} rewards={rewards_str}", flush=True)

inference.py CHANGED Viewed

@@ -76,14 +76,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
                 done = exec_result.done
             except Exception as exc:
                 print(f"[DEBUG] env step failed: {exc}", flush=True)
-                log_step(step=step + 1, action=action, reward=0.0, done=False, error=str(exc))
                 rewards.append(0.0)
                 continue
             rewards.append(reward)
             error = exec_obs.error if not exec_obs.success else None
             result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
-            log_step(step=step + 1, action=action, reward=reward, done=done, error=error)
             messages.append({"role": "assistant", "content": response_text})
             messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
@@ -97,13 +97,14 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
                 score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
             except Exception as exc:
                 print(f"[DEBUG] env step failed: {exc}", flush=True)
-                log_step(step=step + 1, action=action, reward=0.0, done=True, error=str(exc))
-                log_end(success=False, steps=step + 1, score=0.0, rewards=rewards)
                 return 0.0
             rewards.append(score)
-            log_step(step=step + 1, action=action, reward=score, done=True, error=None)
-            log_end(success=score > 0.0, steps=step + 1, score=score, rewards=rewards)
             return score
         else:
@@ -127,7 +128,7 @@ def run_task(openai_client: OpenAI, env_client: Any, task_id: int) -> float:
                 }
             )
-    log_end(success=False, steps=MAX_STEPS, score=0.0, rewards=rewards)
     return 0.0

                 done = exec_result.done
             except Exception as exc:
                 print(f"[DEBUG] env step failed: {exc}", flush=True)
+                log_step(step=step + 1, action=action_type, reward=0.0, done=False, error=str(exc))
                 rewards.append(0.0)
                 continue
             rewards.append(reward)
             error = exec_obs.error if not exec_obs.success else None
             result_text = f"Output: {exec_obs.output}" if not exec_obs.error else f"Error: {exec_obs.error}"
+            log_step(step=step + 1, action=action_type, reward=reward, done=done, error=error)
             messages.append({"role": "assistant", "content": response_text})
             messages.append({"role": "user", "content": [{"type": "text", "text": result_text}]})
                 score = float(submit_obs.metadata.get("score", 0.0) if submit_obs.metadata else submit_result.reward)
             except Exception as exc:
                 print(f"[DEBUG] env step failed: {exc}", flush=True)
+                log_step(step=step + 1, action=action_type, reward=0.0, done=True, error=str(exc))
+                log_end(success=False, steps=step + 1, rewards=rewards)
                 return 0.0
+            score = max(0.01, min(0.99, score))
             rewards.append(score)
+            log_step(step=step + 1, action=action_type, reward=score, done=True, error=None)
+            log_end(success=score > 0.01, steps=step + 1, rewards=rewards)
             return score
         else:
                 }
             )
+    log_end(success=False, steps=MAX_STEPS, rewards=rewards)
     return 0.0

openenv.yaml CHANGED Viewed

@@ -6,3 +6,16 @@ type: space
 runtime: fastapi
 app: server.app:app
 port: 8000

 runtime: fastapi
 app: server.app:app
 port: 8000
+tasks:
+  - id: task_easy
+    grader: tasks.task_easy:TopRevenueCategoryTask
+  - id: task_medium
+    grader: tasks.task_medium:CityRevenueShareTask
+  - id: task_medium_2
+    grader: tasks.task_medium_2:MonthlyRevenueRatioTask
+  - id: task_hard
+    grader: tasks.task_hard:RepeatCustomerCohortTask
+  - id: task_hard_2
+    grader: tasks.task_hard_2:CustomerLoyaltyRevenueTask
+  - id: task_hard_3
+    grader: tasks.task_hard_3:SupplierProfitabilityTask

tasks/task_easy.py CHANGED Viewed

@@ -49,4 +49,5 @@ class TopRevenueCategoryTask(BaseTask):
         """
         expected = self.expected_answer().strip().lower()
         submitted = answer.strip().lower()
-        return 1.0 if expected in submitted else 0.0

         """
         expected = self.expected_answer().strip().lower()
         submitted = answer.strip().lower()
+        raw = 1.0 if expected in submitted else 0.0
+        return max(0.01, min(0.99, raw))

tasks/task_hard.py CHANGED Viewed

@@ -100,4 +100,4 @@ class RepeatCustomerCohortTask(BaseTask):
             except ValueError:
                 pass
-        return score

             except ValueError:
                 pass
+        return max(0.01, min(0.99, score))

tasks/task_hard_2.py CHANGED Viewed

@@ -100,4 +100,4 @@ class CustomerLoyaltyRevenueTask(BaseTask):
             except ValueError:
                 pass
-        return score

             except ValueError:
                 pass
+        return max(0.01, min(0.99, score))

tasks/task_hard_3.py CHANGED Viewed

@@ -104,4 +104,4 @@ class SupplierProfitabilityTask(BaseTask):
             except ValueError:
                 pass
-        return score

             except ValueError:
                 pass
+        return max(0.01, min(0.99, score))

tasks/task_medium.py CHANGED Viewed

@@ -74,4 +74,4 @@ class CityRevenueShareTask(BaseTask):
             except ValueError:
                 pass
-        return score

             except ValueError:
                 pass
+        return max(0.01, min(0.99, score))

tasks/task_medium_2.py CHANGED Viewed

@@ -85,4 +85,4 @@ class MonthlyRevenueRatioTask(BaseTask):
             except ValueError:
                 pass
-        return score

             except ValueError:
                 pass
+        return max(0.01, min(0.99, score))