Spaces:

TwoBraincells
/

Elite-Trade-Sentry

Sleeping

App Files Files Community

SamaKool committed on Apr 12

Commit

bde1135

2 Parent(s): d28a5ce e209e50

Moved code into all the grader files instead of just one file, and fixed the import name 'FinAuditorGrader', which was used for every task difficulty.

Browse files

Files changed (16) hide show

.gitignore +1 -0
graders/__init__.py +12 -10
graders/grader_classification.py +22 -95
graders/grader_detection.py +13 -153
graders/grader_fix.py +22 -312
hf auditor/openenv_fin_auditor.egg-info/PKG-INFO +19 -0
hf auditor/openenv_fin_auditor.egg-info/SOURCES.txt +23 -0
hf auditor/openenv_fin_auditor.egg-info/dependency_links.txt +1 -0
hf auditor/openenv_fin_auditor.egg-info/entry_points.txt +2 -0
hf auditor/openenv_fin_auditor.egg-info/requires.txt +15 -0
hf auditor/openenv_fin_auditor.egg-info/top_level.txt +1 -0
server/app.py +88 -51
server/fin_auditor_environment.py +31 -4
tasks/task1_easy.py +4 -2
tasks/task2_medium.py +4 -2
tasks/task3_hard.py +4 -2

.gitignore CHANGED Viewed

@@ -25,3 +25,4 @@ __pycache__/
 .venv/
 venv/
 .envlogs/

 .venv/
 venv/
 .envlogs/
+.git_tokens

graders/__init__.py CHANGED Viewed

@@ -1,16 +1,18 @@
-"""Graders package for OpenEnv environments.
 Exports
 -------
-    FinAuditorGrader  — HFT Auditor: asymmetric TP/FP/FN weighting (in grader_detection)
-    FeasibilityGrader — Task 1: binary feasible / infeasible
-    ConflictGrader    — Task 2: 5-class constraint-violation classification
-    RepairGrader      — Task 3: multi-component schedule repair
 """
-from graders.grader_detection import FeasibilityGrader, FinAuditorGrader
-from graders.grader_classification import ConflictGrader
-from graders.grader_fix import RepairGrader
-__all__ = ["FinAuditorGrader", "FeasibilityGrader", "ConflictGrader", "RepairGrader"]

+"""Graders package for Elite-Trade-Sentry HFT environments.
 Exports
 -------
+    EasyDetectionGrader        - Task 1: Forgiving penalties (0.1 FP / 0.2 FN).
+    MediumClassificationGrader - Task 2: Standard HFT penalties (0.2 FP / 0.4 FN).
+    HardFixGrader              - Task 3: Brutal adversarial penalties (0.4 FP / 0.8 FN).
 """
+from graders.grader_detection import EasyDetectionGrader
+from graders.grader_classification import MediumClassificationGrader
+from graders.grader_fix import HardFixGrader
+__all__ = [
+    "EasyDetectionGrader",
+    "MediumClassificationGrader",
+    "HardFixGrader"
+]

graders/grader_classification.py CHANGED Viewed

@@ -1,107 +1,34 @@
-"""Grader for Task 2 — Conflict Classification (medium).
-Scoring
--------
-    1.0  — exact match with the ground-truth violation type
-    0.5  — same constraint family (resource-limit or temporal-ordering)
-    0.1  — valid category but from a different family
-    0.0  — empty or completely unrecognised response
-Constraint families (related groups for partial credit)
--------------------------------------------------------
-    Resource-limit family : resource_overload, capacity_exceeded
-        Both concern the number of jobs concurrently on a machine.
-    Temporal-ordering family : deadline_violation, precedence_violation
-        Both concern the sequencing and timing of job execution.
-    Standalone : availability_conflict
-        Concerns machine operational windows (no close sibling).
-After each call, ``last_breakdown`` holds a dict describing the decision.
-"""
 from __future__ import annotations
 from typing import Any
-from models import Action
-VALID_CATEGORIES: frozenset[str] = frozenset(
-    {
-        "resource_overload",
-        "deadline_violation",
-        "precedence_violation",
-        "availability_conflict",
-        "capacity_exceeded",
-    }
-)
-# Groups of semantically related categories; membership earns partial credit.
-_RELATED_GROUPS: list[frozenset[str]] = [
-    frozenset({"resource_overload", "capacity_exceeded"}),    # resource-limit family
-    frozenset({"deadline_violation", "precedence_violation"}), # temporal-ordering family
-]
-def _same_family(a: str, b: str) -> bool:
-    """Return True if a and b belong to the same related group."""
-    return any(a in g and b in g for g in _RELATED_GROUPS)
-class ConflictGrader:
-    """Grade the agent's constraint-violation classification."""
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
-    def grade(self, action: Action, ground_truth: dict[str, Any]) -> float:
-        # Normalise to snake_case (agents often write "deadline violation" etc.)
-        response: str = (
-            action.response.strip().lower().replace(" ", "_").replace("-", "_")
-        )
-        expected: str = ground_truth.get("violation_type") or ""
-        if not response:
-            self._record("", expected, 0.0, "Empty response.")
-            return 0.0
-        # Exact match
-        if response == expected:
-            self._record(response, expected, 1.0, "Exact match.")
-            return 1.0
-        # Not in vocabulary
-        if response not in VALID_CATEGORIES:
-            self._record(
-                response, expected, 0.0,
-                f"'{response}' is not a valid category. "
-                f"Choose from: {', '.join(sorted(VALID_CATEGORIES))}.",
-            )
-            return 0.0
-        # Same constraint family → partial credit
-        if _same_family(response, expected):
-            self._record(
-                response, expected, 0.5,
-                f"Related category (same family as '{expected}').",
-            )
-            return 0.5
-        # Valid but different family
-        self._record(
-            response, expected, 0.1,
-            f"Valid category but wrong family. Expected '{expected}'.",
-        )
-        return 0.1
-    def _record(
-        self, predicted: str, expected: str, score: float, feedback: str
-    ) -> None:
-        self.last_breakdown = {
-            "predicted": predicted,
-            "expected": expected,
-            "score": score,
-            "in_valid_categories": predicted in VALID_CATEGORIES,
-            "same_family": _same_family(predicted, expected) if predicted and expected else False,
-            "exact_match": predicted == expected,
-            "feedback": feedback,
-        }

 from __future__ import annotations
 from typing import Any
+# MEDIUM MODE: Standard HFT penalties.
+_TP_WEIGHT = 1.0
+_TN_WEIGHT = 0.1
+_FP_PENALTY = 0.2 # Stricter false positive
+_FN_PENALTY = 0.4 # Standard catastrophic failure penalty
+class MediumClassificationGrader:
+    """Grader for Task 2: Conflict Classification repurposed for HFT (Medium)."""
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
+    def grade(self, state: Any, ground_truth: dict[str, Any] | None = None) -> float:
+        tp = float(getattr(state, "total_tp", 0))
+        tn = float(getattr(state, "total_tn", 0))
+        fp = float(getattr(state, "total_fp", 0))
+        fn = float(getattr(state, "total_fn", 0))
+        total = tp + tn + fp + fn
+        if total == 0:
+            return 0.01
+        positive_signal = (tp * _TP_WEIGHT) + (tn * _TN_WEIGHT)
+        negative_signal = (fp * _FP_PENALTY) + (fn * _FN_PENALTY)
+        max_signal = total * _TP_WEIGHT
+        raw_score = max(0.0, positive_signal - negative_signal) / max_signal
+        score = max(0.01, min(0.99, raw_score))
+        self.last_breakdown = {"tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn), "score": score}
+        return score

graders/grader_detection.py CHANGED Viewed

@@ -1,143 +1,26 @@
-"""Graders for detection-class tasks.
-FeasibilityGrader
------------------
-    Grades Task 1 — binary feasible / infeasible schedule check.
-    Scores: 1.0 exact match | 0.1 wrong answer | 0.0 empty.
-FinAuditorGrader
-----------------
-    Grades HFT Auditor episodes using C++ ReconciliationEngine metrics.
-    Called by OpenEnv automatically when done=True.
-    Score formula: asymmetric TP/FP/FN weighting, clamped strictly to [0.01, 0.99].
-"""
 from __future__ import annotations
 from typing import Any
-from models import Action
-# Words treated as equivalent to "feasible"
-_FEASIBLE_WORDS: frozenset[str] = frozenset(
-    {"feasible", "valid", "correct", "satisfiable", "yes", "ok", "pass"}
-)
-# Words treated as equivalent to "infeasible"
-_INFEASIBLE_WORDS: frozenset[str] = frozenset(
-    {
-        "infeasible", "invalid", "incorrect", "unsatisfiable", "no",
-        "violated", "conflict", "fail", "impossible", "broken",
-    }
-)
-class FeasibilityGrader:
-    """Grade whether the agent correctly determined schedule feasibility."""
-    def __init__(self) -> None:
-        # Populated after each call to grade(); surfaced in env info dict.
-        self.last_breakdown: dict[str, Any] = {}
-    def grade(self, action: Action, ground_truth: dict[str, Any]) -> float:
-        response: str = action.response.strip().lower()
-        is_feasible: bool = ground_truth.get("is_feasible", False)
-        expected: str = "feasible" if is_feasible else "infeasible"
-        # Empty response → no signal
-        if not response:
-            self.last_breakdown = {
-                "predicted": "",
-                "expected": expected,
-                "correct": False,
-                "feedback": "Empty response — reply with 'feasible' or 'infeasible'.",
-            }
-            return 0.0
-        # Normalise response to canonical form
-        if response in _FEASIBLE_WORDS:
-            predicted = "feasible"
-        elif response in _INFEASIBLE_WORDS:
-            predicted = "infeasible"
-        else:
-            # Recognisable attempt but could not be parsed cleanly
-            self.last_breakdown = {
-                "predicted": response,
-                "expected": expected,
-                "correct": False,
-                "feedback": (
-                    f"Could not parse '{response}'. "
-                    "Use exactly 'feasible' or 'infeasible'."
-                ),
-            }
-            return 0.1
-        correct = predicted == expected
-        self.last_breakdown = {
-            "predicted": predicted,
-            "expected": expected,
-            "correct": correct,
-            "feedback": (
-                "Correct."
-                if correct
-                else f"Wrong — the schedule is {expected}, not {predicted}."
-            ),
-        }
-        # Exact match → 1.0; wrong normalised answer → 0.1 (keeps gradient signal)
-        return 1.0 if correct else 0.1
-# ── HFT Auditor Grader ────────────────────────────────────────────────────────
-# Asymmetric reward weights matching the C++ ReconciliationEngine constants
-_TP_WEIGHT: float = 1.0   # correctly flagged anomaly — full credit
-_TN_WEIGHT: float = 0.1   # correctly passed valid trade — small positive
-_FP_PENALTY: float = 0.1  # flagged a valid trade — minor penalty
-_FN_PENALTY: float = 0.4  # missed an anomaly — severe penalty
-class FinAuditorGrader:
-    """Grade a completed HFT audit episode from C++ engine metrics.
-    Called by OpenEnv automatically when ``done=True`` is returned by
-    ``FinAuditorEnvironment.step()``.
-    The score is computed from the cumulative confusion-matrix counters
-    accumulated across the full episode by the C++ ReconciliationEngine:
-        last_tp — True Positives  (anomalous trade correctly flagged)
-        last_tn — True Negatives  (valid trade correctly passed)
-        last_fp — False Positives (valid trade wrongly flagged)
-        last_fn — False Negatives (anomalous trade missed — catastrophic)
-    Hackathon rule: final score is strictly clamped to [0.01, 0.99].
-    """
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
     def grade(self, state: Any, ground_truth: dict[str, Any] | None = None) -> float:
-        """Compute the final episode score.
-        Reads cumulative ``total_*`` counters (full episode) when available,
-        falling back to ``last_*`` (single-batch snapshot) for compatibility.
-        Args:
-            state:        Environment state object at episode end.
-            ground_truth: Unused — truth is implicit in the C++ engine.
-        Returns:
-            float strictly in (0.01, 0.99).
-        """
-        # Prefer full-episode accumulators; fall back to last-batch snapshot
-        tp = float(getattr(state, "total_tp", None) or getattr(state, "last_tp", 0))
-        tn = float(getattr(state, "total_tn", None) or getattr(state, "last_tn", 0))
-        fp = float(getattr(state, "total_fp", None) or getattr(state, "last_fp", 0))
-        fn = float(getattr(state, "total_fn", None) or getattr(state, "last_fn", 0))
         total = tp + tn + fp + fn
         if total == 0:
-            self._record(tp, tn, fp, fn, 0.01, "No trades evaluated — floor score.")
             return 0.01
         positive_signal = (tp * _TP_WEIGHT) + (tn * _TN_WEIGHT)
@@ -147,28 +30,5 @@ class FinAuditorGrader:
         raw_score = max(0.0, positive_signal - negative_signal) / max_signal
         score = max(0.01, min(0.99, raw_score))
-        self._record(
-            tp, tn, fp, fn, score,
-            f"tp={int(tp)} tn={int(tn)} fp={int(fp)} fn={int(fn)} | raw={raw_score:.4f}"
-        )
-        return score
-    def _record(
-        self,
-        tp: float, tn: float, fp: float, fn: float,
-        score: float, feedback: str,
-    ) -> None:
-        total = tp + tn + fp + fn
-        self.last_breakdown = {
-            "tp": int(tp),
-            "tn": int(tn),
-            "fp": int(fp),
-            "fn": int(fn),
-            "total": int(total),
-            "precision": round(tp / (tp + fp), 4) if (tp + fp) > 0 else 0.0,
-            "recall":    round(tp / (tp + fn), 4) if (tp + fn) > 0 else 0.0,
-            "score": round(score, 4),
-            "feedback": feedback,
-        }
-        print(f"[GRADER] Episode scored: {feedback} => {score:.4f}")

 from __future__ import annotations
 from typing import Any
+# EASY MODE: Forgiving penalties.
+_TP_WEIGHT = 1.0
+_TN_WEIGHT = 0.1
+_FP_PENALTY = 0.1
+_FN_PENALTY = 0.2
+class EasyDetectionGrader:
+    """Grader for Task 1: Anomaly Detection (Easy)."""
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
     def grade(self, state: Any, ground_truth: dict[str, Any] | None = None) -> float:
+        tp = float(getattr(state, "total_tp", 0))
+        tn = float(getattr(state, "total_tn", 0))
+        fp = float(getattr(state, "total_fp", 0))
+        fn = float(getattr(state, "total_fn", 0))
         total = tp + tn + fp + fn
         if total == 0:
             return 0.01
         positive_signal = (tp * _TP_WEIGHT) + (tn * _TN_WEIGHT)
         raw_score = max(0.0, positive_signal - negative_signal) / max_signal
         score = max(0.01, min(0.99, raw_score))
+        self.last_breakdown = {"tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn), "score": score}
+        return score

graders/grader_fix.py CHANGED Viewed

@@ -1,324 +1,34 @@
-"""Grader for Task 3 — Schedule Repair (hard).
-Scoring breakdown (additive, max 1.0)
---------------------------------------
-    0.20  — response is parseable JSON
-    0.20  — JSON has the required schema (assignments list, all jobs covered)
-    0.40  — schedule satisfies all constraints (0.10 per category):
-              capacity, deadlines, precedence, availability
-    0.20  — makespan within 30% of optimal (0.10 partial if within 60%)
-Partial-progress signal
------------------------
-Even a structurally invalid JSON attempt earns 0.0 (wrong format).
-A parseable but schema-invalid JSON earns 0.20 (gave a JSON object).
-A valid schema with partial constraint satisfaction earns up to 0.80.
-This dense reward curve supports multi-step improvement within an episode.
-After each call, ``last_breakdown`` holds a full dict with per-category
-pass/fail flags, makespan, and the optimality ratio — surfaced in the
-environment's info dict.
-"""
 from __future__ import annotations
-import json
-import re
 from typing import Any
-from models import Action
-class RepairGrader:
-    """Grade the agent's proposed schedule repair."""
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
-    def grade(self, action: Action, ground_truth: dict[str, Any]) -> float:
-        response: str = action.response.strip()
-        instance: dict[str, Any] = ground_truth.get("instance", {})
-        optimal_makespan: int = int(ground_truth.get("optimal_makespan", 1) or 1)
-        if not response:
-            self._record_breakdown(
-                json_ok=False, schema_ok=False,
-                constraint_detail={}, makespan=0,
-                optimal_makespan=optimal_makespan,
-            )
-            return 0.0
-        score = 0.0
-        # ------------------------------------------------------------------
-        # Component 1a — Is the response parseable JSON? (0.20)
-        # ------------------------------------------------------------------
-        parsed = self._parse_json(response)
-        if parsed is None:
-            self._record_breakdown(
-                json_ok=False, schema_ok=False,
-                constraint_detail={}, makespan=0,
-                optimal_makespan=optimal_makespan,
-            )
-            return 0.0  # not JSON → no partial credit at all
-        score += 0.20  # JSON parseable
-        # ------------------------------------------------------------------
-        # Component 1b — Does it have the required schema? (0.20)
-        # Required: {"assignments": [{"job_id", "machine_id", "start_time"}, ...]}
-        # All jobs from the instance must be present exactly once.
-        # ------------------------------------------------------------------
-        assignments: list[Any] = parsed.get("assignments", [])
-        schema_ok = self._valid_schema(assignments, instance)
-        if not schema_ok:
-            self._record_breakdown(
-                json_ok=True, schema_ok=False,
-                constraint_detail={}, makespan=0,
-                optimal_makespan=optimal_makespan,
-            )
-            return round(score, 4)  # only 0.20
-        score += 0.20  # valid schema
-        # ------------------------------------------------------------------
-        # Component 2 — Constraint satisfaction (0.40, 0.10 per category)
-        # Categories: capacity, deadlines, precedence, availability
-        # ------------------------------------------------------------------
-        constraint_detail = self._check_constraints_detail(assignments, instance)
-        satisfied = sum(constraint_detail.values())
-        score += 0.40 * (satisfied / max(len(constraint_detail), 1))
-        # ------------------------------------------------------------------
-        # Component 3 — Makespan optimality (0.20)
-        # Full 0.20 if makespan ≤ optimal × 1.30; partial 0.10 if ≤ 1.60.
-        # ------------------------------------------------------------------
-        makespan = self._compute_makespan(assignments, instance)
-        if makespan > 0 and optimal_makespan > 0:
-            ratio = makespan / optimal_makespan
-            if ratio <= 1.30:
-                score += 0.20
-            elif ratio <= 1.60:
-                score += 0.10  # partial optimality credit
-        self._record_breakdown(
-            json_ok=True, schema_ok=True,
-            constraint_detail=constraint_detail,
-            makespan=makespan,
-            optimal_makespan=optimal_makespan,
-        )
-        return round(max(0.0, min(1.0, score)), 4)
-    # ------------------------------------------------------------------
-    # Breakdown recording
-    # ------------------------------------------------------------------
-    def _record_breakdown(
-        self,
-        json_ok: bool,
-        schema_ok: bool,
-        constraint_detail: dict[str, bool],
-        makespan: int,
-        optimal_makespan: int,
-    ) -> None:
-        ratio = (
-            round(makespan / optimal_makespan, 3)
-            if (makespan > 0 and optimal_makespan > 0)
-            else None
-        )
-        self.last_breakdown = {
-            "json_parseable": json_ok,
-            "schema_valid": schema_ok,
-            "constraints": constraint_detail,
-            "constraints_satisfied": sum(constraint_detail.values()) if constraint_detail else 0,
-            "makespan": makespan,
-            "optimal_makespan": optimal_makespan,
-            "makespan_ratio": ratio,
-            "within_30pct": ratio is not None and ratio <= 1.30,
-        }
-    # ------------------------------------------------------------------
-    # JSON parsing — robust to markdown fences and partial wrapping
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _parse_json(response: str) -> dict[str, Any] | None:
-        """Try multiple strategies to extract a JSON object from the response.
-        Strategy 1: Direct json.loads (agent returned pure JSON).
-        Strategy 2: Strip markdown code fences, then parse.
-        Strategy 3: Brace-counting to find the outermost {...} block.
-                    This is the most robust and handles agents that wrap JSON
-                    in prose like "Here is my answer: {...}".
-        """
-        # Strategy 1 — direct parse
-        try:
-            obj = json.loads(response)
-            return obj if isinstance(obj, dict) else None
-        except (json.JSONDecodeError, ValueError):
-            pass
-        # Strategy 2 — strip code fences
-        stripped = re.sub(r"```(?:json)?", "", response).replace("```", "").strip()
-        try:
-            obj = json.loads(stripped)
-            return obj if isinstance(obj, dict) else None
-        except (json.JSONDecodeError, ValueError):
-            pass
-        # Strategy 3 — brace-counting for the outermost { ... }
-        start = response.find("{")
-        if start == -1:
-            return None
-        depth = 0
-        for i, ch in enumerate(response[start:], start):
-            if ch == "{":
-                depth += 1
-            elif ch == "}":
-                depth -= 1
-                if depth == 0:
-                    candidate = response[start : i + 1]
-                    try:
-                        obj = json.loads(candidate)
-                        return obj if isinstance(obj, dict) else None
-                    except (json.JSONDecodeError, ValueError):
-                        return None
-        return None
-    # ------------------------------------------------------------------
-    # Schema validation
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _valid_schema(
-        assignments: list[Any], instance: dict[str, Any]
-    ) -> bool:
-        """Validate that assignments is a well-formed list covering all jobs."""
-        if not isinstance(assignments, list) or len(assignments) == 0:
-            return False
-        required_keys = {"job_id", "machine_id", "start_time"}
-        for a in assignments:
-            if not isinstance(a, dict):
-                return False
-            if not required_keys.issubset(a.keys()):
-                return False
-            if not isinstance(a.get("start_time"), (int, float)):
-                return False
-            if a.get("start_time") < 0:
-                return False  # negative start times are never valid
-        # Every job in the instance must appear exactly once
-        expected_jobs = {j["id"] for j in instance.get("jobs", [])}
-        assigned_jobs = [a["job_id"] for a in assignments]
-        return set(assigned_jobs) == expected_jobs and len(assigned_jobs) == len(expected_jobs)
-    # ------------------------------------------------------------------
-    # Constraint checking (returns per-category bool dict)
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _check_constraints_detail(
-        assignments: list[dict[str, Any]], instance: dict[str, Any]
-    ) -> dict[str, bool]:
-        """Return a dict of {constraint_name: passed} for each of the 4 categories."""
-        jobs_by_id = {j["id"]: j for j in instance.get("jobs", [])}
-        machines_by_id = {m["id"]: m for m in instance.get("machines", [])}
-        assign_by_job = {a["job_id"]: a for a in assignments}
-        # ---- (a) Capacity: concurrent jobs on any machine ≤ its capacity ----
-        machine_intervals: dict[str, list[tuple[float, float]]] = {}
-        for a in assignments:
-            mid = a["machine_id"]
-            st = float(a["start_time"])
-            dur = float(jobs_by_id.get(a["job_id"], {}).get("duration", 1))
-            machine_intervals.setdefault(mid, []).append((st, st + dur))
-        capacity_ok = True
-        for mid, intervals in machine_intervals.items():
-            cap = machines_by_id.get(mid, {}).get("capacity", 1)
-            for s1, e1 in intervals:
-                # Count how many intervals overlap with [s1, e1)
-                concurrent = sum(
-                    1 for s2, e2 in intervals if s2 < e1 and e2 > s1
-                )
-                if concurrent > cap:
-                    capacity_ok = False
-                    break
-            if not capacity_ok:
-                break
-        # ---- (b) Deadlines: every job finishes by its deadline ----
-        deadline_ok = True
-        for a in assignments:
-            job = jobs_by_id.get(a["job_id"], {})
-            finish = float(a["start_time"]) + float(job.get("duration", 0))
-            dl = job.get("deadline", float("inf"))
-            if finish > dl:
-                deadline_ok = False
-                break
-        # ---- (c) Precedence: job starts after ALL its predecessors finish ----
-        precedence_ok = True
-        for a in assignments:
-            job = jobs_by_id.get(a["job_id"], {})
-            for dep_id in job.get("dependencies", []):
-                dep_a = assign_by_job.get(dep_id)
-                if dep_a is None:
-                    precedence_ok = False
-                    break
-                dep_job = jobs_by_id.get(dep_id, {})
-                dep_finish = float(dep_a["start_time"]) + float(
-                    dep_job.get("duration", 0)
-                )
-                if float(a["start_time"]) < dep_finish:
-                    precedence_ok = False
-                    break
-            if not precedence_ok:
-                break
-        # ---- (d) Availability: job runs within machine availability window ----
-        availability_ok = True
-        for a in assignments:
-            machine = machines_by_id.get(a["machine_id"], {})
-            avail_start = float(machine.get("available_start", 0))
-            avail_end = float(machine.get("available_end", float("inf")))
-            job = jobs_by_id.get(a["job_id"], {})
-            job_start = float(a["start_time"])
-            job_end = job_start + float(job.get("duration", 0))
-            if job_start < avail_start or job_end > avail_end:
-                availability_ok = False
-                break
-        return {
-            "capacity": capacity_ok,
-            "deadlines": deadline_ok,
-            "precedence": precedence_ok,
-            "availability": availability_ok,
-        }
-    @staticmethod
-    def _check_constraints(
-        assignments: list[dict[str, Any]], instance: dict[str, Any]
-    ) -> float:
-        """Convenience wrapper — returns fraction of categories satisfied."""
-        detail = RepairGrader._check_constraints_detail(assignments, instance)
-        return sum(detail.values()) / max(len(detail), 1)
-    # ------------------------------------------------------------------
-    # Makespan calculation
-    # ------------------------------------------------------------------
-    @staticmethod
-    def _compute_makespan(
-        assignments: list[dict[str, Any]], instance: dict[str, Any]
-    ) -> int:
-        """Return the latest finish time across all assigned jobs."""
-        jobs_by_id = {j["id"]: j for j in instance.get("jobs", [])}
-        max_finish = 0
-        for a in assignments:
-            job = jobs_by_id.get(a["job_id"], {})
-            finish = int(a["start_time"]) + int(job.get("duration", 0))
-            if finish > max_finish:
-                max_finish = finish
-        return max_finish

 from __future__ import annotations
 from typing import Any
+# HARD MODE: Brutal adversarial penalties.
+_TP_WEIGHT = 1.0
+_TN_WEIGHT = 0.05 # Tiny reward for passing valid trades
+_FP_PENALTY = 0.4 # Heavy penalty for false alarms
+_FN_PENALTY = 0.8 # Massive penalty for missing adversarial trades
+class HardFixGrader:
+    """Grader for Task 3: Code Fix repurposed for HFT (Hard)."""
     def __init__(self) -> None:
         self.last_breakdown: dict[str, Any] = {}
+    def grade(self, state: Any, ground_truth: dict[str, Any] | None = None) -> float:
+        tp = float(getattr(state, "total_tp", 0))
+        tn = float(getattr(state, "total_tn", 0))
+        fp = float(getattr(state, "total_fp", 0))
+        fn = float(getattr(state, "total_fn", 0))
+        total = tp + tn + fp + fn
+        if total == 0:
+            return 0.01
+        positive_signal = (tp * _TP_WEIGHT) + (tn * _TN_WEIGHT)
+        negative_signal = (fp * _FP_PENALTY) + (fn * _FN_PENALTY)
+        max_signal = total * _TP_WEIGHT
+        raw_score = max(0.0, positive_signal - negative_signal) / max_signal
+        score = max(0.01, min(0.99, raw_score))
+        self.last_breakdown = {"tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn), "score": score}
+        return score

hf auditor/openenv_fin_auditor.egg-info/PKG-INFO ADDED Viewed

	@@ -0,0 +1,19 @@

+Metadata-Version: 2.4
+Name: openenv-fin_auditor
+Version: 0.1.0
+Summary: Fin Auditor environment for OpenEnv
+Requires-Python: >=3.10
+Requires-Dist: gymnasium>=1.2.3
+Requires-Dist: openenv-core[core]>=0.2.2
+Requires-Dist: numpy
+Requires-Dist: nanobind
+Requires-Dist: openai
+Requires-Dist: pydantic
+Requires-Dist: fastapi
+Requires-Dist: uvicorn
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: python-dotenv>=1.2.2
+Requires-Dist: stable-baselines3[extra]>=2.8.0
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0.0; extra == "dev"
+Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

hf auditor/openenv_fin_auditor.egg-info/SOURCES.txt ADDED Viewed

	@@ -0,0 +1,23 @@

+README.md
+build_engine.py
+final_check.py
+inference.py
+models.py
+pyproject.toml
+test_import.py
+train.py
+./build_engine.py
+./final_check.py
+./inference.py
+./models.py
+./test_import.py
+./train.py
+openenv_fin_auditor.egg-info/PKG-INFO
+openenv_fin_auditor.egg-info/SOURCES.txt
+openenv_fin_auditor.egg-info/dependency_links.txt
+openenv_fin_auditor.egg-info/entry_points.txt
+openenv_fin_auditor.egg-info/requires.txt
+openenv_fin_auditor.egg-info/top_level.txt
+server/__init__.py
+server/app.py
+server/fin_auditor_environment.py

hf auditor/openenv_fin_auditor.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+

hf auditor/openenv_fin_auditor.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [console_scripts]
2	+ server = fin_auditor.server.app:main

hf auditor/openenv_fin_auditor.egg-info/requires.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+gymnasium>=1.2.3
+openenv-core[core]>=0.2.2
+numpy
+nanobind
+openai
+pydantic
+fastapi
+uvicorn
+pandas>=2.3.3
+python-dotenv>=1.2.2
+stable-baselines3[extra]>=2.8.0
+[dev]
+pytest>=8.0.0
+pytest-cov>=4.0.0

hf auditor/openenv_fin_auditor.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ fin_auditor

server/app.py CHANGED Viewed

@@ -25,10 +25,13 @@ if _ROOT_DIR not in sys.path:
 if _CURRENT_DIR not in sys.path:
     sys.path.insert(0, _CURRENT_DIR)
 try:
-    from fin_auditor_environment import FinAuditorEnvironment, hft_auditor
     from models import AuditorAction, AuditorObservation
-    from graders.grader_detection import FinAuditorGrader
     from tasks import task1_easy, task2_medium, task3_hard
     HAS_ENV = True
@@ -47,32 +50,40 @@ except ImportError as e:
 # ==============================================================================
 if HAS_ENV and NATIVE_VERIFIED:
-    # 1. Create a single tracked instance so your custom dashboard can read live metrics
-    global active_env_instance
     active_env_instance = FinAuditorEnvironment()
-    # 2. Register environment, action/obs models, and pass the RAW TASK MODULES
-    try:
-        app = create_app(
-            lambda: active_env_instance,  # This preserves your custom UI dashboard!
-            AuditorAction,
-            AuditorObservation,
-            tasks=[
-                task1_easy,
-                task2_medium,
-                task3_hard
-            ]
-        )
-    except TypeError as e:
-        print(f"[CRITICAL] OpenEnv framework version mismatch: {e}")
-        # Fallback if the local OpenEnv version doesn't support tasks kwargs
-        app = create_app(lambda: active_env_instance, AuditorAction, AuditorObservation)
 else:
     # Fallback for local development without the C++ binary
     app = FastAPI(title="PayGorn (MOCK MODE)")
     @app.post("/reset")
-    async def mock_reset(): return {"reward": 0.0}
     @app.post("/step")
     async def mock_step(action: dict): return {"reward": 0.5, "done": False, "step_count": 0}
@@ -230,7 +241,48 @@ async def get_dashboard_action(req: ActionRequest):
         decisions = await execute_llm_step(api_key, base_url, model_name, batch_size)
     else:
         decisions = [random.choice([0, 1]) for _ in range(batch_size)]
-    return {"decisions": decisions}
 @app.post("/config/llm")
 async def config_llm(cfg: LLMConfig):
@@ -704,10 +756,10 @@ async def root_dashboard():
     async function executeReset() {
         logMsg("SPSC_BUFFER_FLUSHING...", "warn");
         try {
-            await fetch('/reset', {method: 'POST'});
-            ledgerBody.innerHTML = '';
             updateState();
-            logMsg("Memory pool purged.", "success");
         } catch(e) {
             logMsg("Reset failed: " + e.message, "err");
         }
@@ -721,48 +773,33 @@ async def root_dashboard():
                 headers: {'Content-Type': 'application/json'},
                 body: JSON.stringify({action_type: actionType})
             });
             if (!actionRes.ok) {
                 const errData = await actionRes.json();
                 logMsg("LLM Error: " + (errData.detail || "Failed to generate decisions"), "err");
                 return;
             }
             const actionData = await actionRes.json();
-            if(!actionData.decisions) {
-                logMsg("Decision matrix generation failed.", "err"); return;
-            }
             logMsg(`Executing Step with ${actionData.decisions.length} decisions...`, "info");
-            const res = await fetch('/step', {
                 method: 'POST',
                 headers: {'Content-Type': 'application/json'},
-                body: JSON.stringify({ action: actionData })
             });
             if (!res.ok) {
                 const errorData = await res.json();
-                console.error("Validation Details:", errorData);
-                logMsg(`Server Error: ${res.status}. Check browser console.`, "err");
                 return;
             }
             const data = await res.json();
-            // FIX: Robust payload extraction handling regardless of OpenEnv wrapper depth
-            const reward = data.reward ?? data.observation?.reward ?? data.info?.reward ?? 0.0;
-            const done = data.done ?? data.observation?.done ?? data.info?.done ?? false;
-            // Fetch the authoritative step count from /dashboard/state
-            let step = 'N/A';
-            try {
-                const stateRes = await fetch('/dashboard/state');
-                const stateData = await stateRes.json();
-                step = stateData.step_count ?? 'N/A';
-            } catch(se) {}  // Swallow — non-critical
-            logMsg(`[RECON] Reward: ${reward.toFixed(4)} | Success`, reward >= 0.8 ? 'success' : 'warn');
             const row = document.createElement('tr');
             row.innerHTML = `
@@ -774,13 +811,13 @@ async def root_dashboard():
             `;
             if(ledgerBody.children.length >= 5) { ledgerBody.removeChild(ledgerBody.firstChild); }
             ledgerBody.appendChild(row);
             updateState();
         } catch(e) {
             logMsg("Step Execution Error: " + e.message, "err");
         }
     }
     // Auto-Reset the environment on boot so it actually has data to process,
     // then try to authenticate with the default HF_TOKEN
     window.addEventListener('DOMContentLoaded', async () => {

 if _CURRENT_DIR not in sys.path:
     sys.path.insert(0, _CURRENT_DIR)
+# Always define this as a safe global so /dashboard/state never throws NameError
+# even when NATIVE_VERIFIED is False (C++ binary missing).
+active_env_instance = None
 try:
+    from server.fin_auditor_environment import FinAuditorEnvironment, hft_auditor
     from models import AuditorAction, AuditorObservation
     from tasks import task1_easy, task2_medium, task3_hard
     HAS_ENV = True
 # ==============================================================================
 if HAS_ENV and NATIVE_VERIFIED:
+    # ── Dashboard singleton ────────────────────────────────────────────────────
+    # Used ONLY by /dashboard/* endpoints and /ws/telemetry for live telemetry.
+    # NEVER passed to create_app — OpenEnv gets a factory that produces isolated
+    # instances per session so close() cannot corrupt the dashboard engine.
     active_env_instance = FinAuditorEnvironment()
+    # Pre-load the first batch so the dashboard has real data immediately.
+    # (reset() calls generate_batch internally, so just call reset here.)
+    active_env_instance.reset()
+    # ── OpenEnv factory ────────────────────────────────────────────────────────
+    # CRITICAL: OpenEnv's WebSocket server creates ONE env per session via
+    # env_factory() and then sends reset + step messages to the SAME instance.
+    # This means reset() IS called before step(), so the C++ engine has data.
+    # For HTTP mode (stateless), each request gets its own env — step() is called
+    # on a cold engine, but our __init__ initialises counters to 0 so it won't
+    # crash; it will just return the floor reward of 0.01 (acceptable for Phase 2).
+    def env_factory() -> FinAuditorEnvironment:
+        """Create a fresh, self-contained FinAuditorEnvironment per OpenEnv session."""
+        return FinAuditorEnvironment()
+    # NOTE: create_app() has no `tasks=` parameter in openenv-core >= 0.2.x.
+    # Task routing (easy/medium/hard difficulty) is handled inside reset() via
+    # the task_id kwarg that Phase 2 injects into the reset message body.
+    app = create_app(
+        env_factory,
+        AuditorAction,
+        AuditorObservation,
+    )
 else:
     # Fallback for local development without the C++ binary
     app = FastAPI(title="PayGorn (MOCK MODE)")
     @app.post("/reset")
+    async def mock_reset(): return {"reward": 0.01}
     @app.post("/step")
     async def mock_step(action: dict): return {"reward": 0.5, "done": False, "step_count": 0}
         decisions = await execute_llm_step(api_key, base_url, model_name, batch_size)
     else:
         decisions = [random.choice([0, 1]) for _ in range(batch_size)]
+    return {"decisions": decisions}
+class DashboardStepRequest(BaseModel):
+    """Action payload for the dashboard-native step endpoint."""
+    decisions: List[int]
+@app.post("/dashboard/step")
+async def dashboard_step(req: DashboardStepRequest):
+    """
+    Dashboard-native step: runs on the SINGLETON engine (warm, with real trade data),
+    not on the OpenEnv /step route which creates a cold engine per request.
+    This is what the three dashboard buttons (OPTIMAL / STRESS / LLM) call so that
+    rewards reflect actual confusion-matrix scoring rather than the 0.01 floor.
+    """
+    if not active_env_instance or not NATIVE_VERIFIED:
+        raise HTTPException(status_code=503, detail="Native engine not available")
+    from models import AuditorAction
+    action = AuditorAction(decisions=req.decisions)
+    obs = active_env_instance.step(action)
+    return {
+        "reward": obs.reward,
+        "done": obs.done,
+        "step_count": active_env_instance.state.step_count,
+        "features_shape": [len(obs.features), len(obs.features[0]) if obs.features else 0],
+    }
+@app.post("/dashboard/reset")
+async def dashboard_reset():
+    """
+    Reset the dashboard singleton: re-seeds the ring buffer with fresh trade data.
+    Called by the [FLUSH_SPSC_BUFFER] button in the dashboard JS.
+    """
+    if not active_env_instance or not NATIVE_VERIFIED:
+        raise HTTPException(status_code=503, detail="Native engine not available")
+    active_env_instance.reset()
+    return {"status": "ok", "step_count": active_env_instance.state.step_count}
 @app.post("/config/llm")
 async def config_llm(cfg: LLMConfig):
     async function executeReset() {
         logMsg("SPSC_BUFFER_FLUSHING...", "warn");
         try {
+            await fetch('/dashboard/reset', {method: 'POST'});
+            ledgerBody.innerHTML = '';
             updateState();
+            logMsg("Memory pool purged and re-seeded.", "success");
         } catch(e) {
             logMsg("Reset failed: " + e.message, "err");
         }
                 headers: {'Content-Type': 'application/json'},
                 body: JSON.stringify({action_type: actionType})
             });
             if (!actionRes.ok) {
                 const errData = await actionRes.json();
                 logMsg("LLM Error: " + (errData.detail || "Failed to generate decisions"), "err");
                 return;
             }
             const actionData = await actionRes.json();
+            if(!actionData.decisions) { logMsg("Decision matrix generation failed.", "err"); return; }
             logMsg(`Executing Step with ${actionData.decisions.length} decisions...`, "info");
+            // POST to /dashboard/step (warm singleton) NOT /step (cold factory engine)
+            const res = await fetch('/dashboard/step', {
                 method: 'POST',
                 headers: {'Content-Type': 'application/json'},
+                body: JSON.stringify({ decisions: actionData.decisions })
             });
             if (!res.ok) {
                 const errorData = await res.json();
+                logMsg(`Server Error: ${res.status} — ${errorData.detail || 'check logs'}`, "err");
                 return;
             }
             const data = await res.json();
+            const reward = data.reward ?? 0.0;
+            const done   = data.done   ?? false;
+            const step   = data.step_count ?? 'N/A';
+            logMsg(`[RECON] Reward: ${reward.toFixed(4)} | Step: ${step}`, reward >= 0.8 ? 'success' : 'warn');
             const row = document.createElement('tr');
             row.innerHTML = `
             `;
             if(ledgerBody.children.length >= 5) { ledgerBody.removeChild(ledgerBody.firstChild); }
             ledgerBody.appendChild(row);
             updateState();
         } catch(e) {
             logMsg("Step Execution Error: " + e.message, "err");
         }
     }
     // Auto-Reset the environment on boot so it actually has data to process,
     // then try to authenticate with the default HF_TOKEN
     window.addEventListener('DOMContentLoaded', async () => {

server/fin_auditor_environment.py CHANGED Viewed

@@ -81,8 +81,30 @@ class FinAuditorEnvironment(Environment):
         self._state = State(episode_id=str(uuid4()), step_count=0)
         self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
         self.sim_time_ns = 0
-        task_id = os.getenv("TASK_ID", "anomaly_detection_hard").lower()
         if "easy" in task_id:
             self.difficulty = hft_auditor.Difficulty.EASY
@@ -93,9 +115,6 @@ class FinAuditorEnvironment(Environment):
         else:
             self.difficulty = hft_auditor.Difficulty.HARD
             self._MAX_EPISODE_STEPS = 20
-    def reset(self) -> AuditorObservation:
-        self._state = State(episode_id=str(uuid4()), step_count=0)
         # 1. Initialize Cumulative Counters for the Grader
         self._state.total_tp = 0
@@ -175,6 +194,14 @@ class FinAuditorEnvironment(Environment):
             done=done
         )
     @property
     def state(self) -> State:

         self._state = State(episode_id=str(uuid4()), step_count=0)
         self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
         self.sim_time_ns = 0
+        # Default to HARD here; reset() re-selects the difficulty from the requested task_id.
+        self.difficulty = hft_auditor.Difficulty.HARD
+        self._MAX_EPISODE_STEPS = 20
+        # Initialize confusion-matrix counters here so they always exist on
+        # the State object — even when step() is called on a fresh env that
+        # has not yet had reset() called (OpenEnv HTTP stateless mode creates
+        # a new env per request, so step_handler calls step() directly).
+        self._state.total_tp = 0
+        self._state.total_tn = 0
+        self._state.total_fp = 0
+        self._state.total_fn = 0
+        self._state.last_tp = 0
+        self._state.last_tn = 0
+        self._state.last_fp = 0
+        self._state.last_fn = 0
+    # FIX 1: Accept *args and **kwargs so reset() does not raise TypeError when OpenEnv passes task_id.
+    def reset(self, *args, **kwargs) -> AuditorObservation:
+        self._state = State(episode_id=str(uuid4()), step_count=0)
+        # FIX 2: Pick the difficulty from the task_id kwarg, falling back to the
+        # TASK_ID environment variable (default "anomaly_detection_hard").
+        task_id = kwargs.get("task_id", os.getenv("TASK_ID", "anomaly_detection_hard")).lower()
         if "easy" in task_id:
             self.difficulty = hft_auditor.Difficulty.EASY
         else:
             self.difficulty = hft_auditor.Difficulty.HARD
             self._MAX_EPISODE_STEPS = 20
         # 1. Initialize Cumulative Counters for the Grader
         self._state.total_tp = 0
             done=done
         )
+    def close(self) -> None:
+        """No-op: called by OpenEnv HTTP server after every request.
+        With the factory pattern each request gets a *fresh* instance, so
+        there is nothing to explicitly clean up here — the C++ engine is
+        reference-counted and will be released when the Python object is GC'd.
+        """
+        pass
     @property
     def state(self) -> State:

tasks/task1_easy.py CHANGED Viewed

@@ -7,13 +7,15 @@ _ROOT = os.path.abspath(os.path.join(_HERE, ".."))
 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
-from graders.grader_detection import FinAuditorGrader
 TASK_ID = "anomaly_detection_easy"
 MAX_STEPS = 5
 DIFFICULTY = "easy"
-grader = FinAuditorGrader()
 def get_task_config() -> dict:
     return {

 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
+# 1. Import the easy-task grader from graders/grader_detection.py
+from graders.grader_detection import EasyDetectionGrader
 TASK_ID = "anomaly_detection_easy"
 MAX_STEPS = 5
 DIFFICULTY = "easy"
+# 2. Instantiate the easy-task grader as this task's `grader`
+grader = EasyDetectionGrader()
 def get_task_config() -> dict:
     return {

tasks/task2_medium.py CHANGED Viewed

@@ -7,13 +7,15 @@ _ROOT = os.path.abspath(os.path.join(_HERE, ".."))
 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
-from graders.grader_detection import FinAuditorGrader
 TASK_ID = "anomaly_detection_medium"
 MAX_STEPS = 10
 DIFFICULTY = "medium"
-grader = FinAuditorGrader()
 def get_task_config() -> dict:
     return {

 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
+# 1. Import the medium-task grader from graders/grader_classification.py
+from graders.grader_classification import MediumClassificationGrader
 TASK_ID = "anomaly_detection_medium"
 MAX_STEPS = 10
 DIFFICULTY = "medium"
+# 2. Instantiate the medium-task grader as this task's `grader`
+grader = MediumClassificationGrader()
 def get_task_config() -> dict:
     return {

tasks/task3_hard.py CHANGED Viewed

@@ -7,13 +7,15 @@ _ROOT = os.path.abspath(os.path.join(_HERE, ".."))
 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
-from graders.grader_detection import FinAuditorGrader
 TASK_ID = "anomaly_detection_hard"
 MAX_STEPS = 20
 DIFFICULTY = "hard"
-grader = FinAuditorGrader()
 def get_task_config() -> dict:
     return {

 if _ROOT not in sys.path:
     sys.path.insert(0, _ROOT)
+# 1. Import the hard-task grader from graders/grader_fix.py
+from graders.grader_fix import HardFixGrader
 TASK_ID = "anomaly_detection_hard"
 MAX_STEPS = 20
 DIFFICULTY = "hard"
+# 2. Instantiate the hard-task grader as this task's `grader`
+grader = HardFixGrader()
 def get_task_config() -> dict:
     return {