Spaces:

TwoBraincells
/

Elite-Trade-Sentry

Sleeping

App Files Files Community

SamaKool commited on Apr 10

Commit

85228ff

1 Parent(s): 3385186

fix: corrected C++ memory clearance, enforced regex compliance, and bootstrapped RL state

Browse files

Files changed (5) hide show

final_check.py +73 -25
hf auditor/src/reconciliation_engine.hpp +12 -3
inference.py +44 -44
server/app.py +41 -2
server/fin_auditor_environment.py +15 -5

final_check.py CHANGED Viewed

@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import os
 import sys
 import json
 import yaml
 import unittest
@@ -29,25 +30,25 @@ class FinalIntegrityCheck(unittest.TestCase):
         print("\n[TEST 1] LLM Parser Robustness...")
         # Test Case A: Markdown wrapped JSON
-        dirty_json = "Here is the result:\n```json\n{\"decisions\": [0, 1, 2]}\n```\nHope this helps!"
         res = inference._parse_llm_decisions(dirty_json, 3)
-        self.assertEqual(res, [0, 1, 2], "Failed to parse markdown-wrapped JSON")
         # Test Case B: Extra text before JSON
-        extra_text = "The decisions are as follows: {\"decisions\": [1, 2]}"
         res = inference._parse_llm_decisions(extra_text, 2)
-        self.assertEqual(res, [1, 2], "Failed to parse JSON with leading text")
-        # Test Case C: Malformed JSON -> should trigger 'Flag All' (2) fallback
         malformed = "{\"decisions\": [0, 1, " # Missing closing bracket
         res = inference._parse_llm_decisions(malformed, 4)
-        self.assertEqual(res, [2, 2, 2, 2], "Failed to trigger fallback on malformed JSON")
         # Test Case D: Correct length normalization
         wrong_len = "{\"decisions\": [1]}"
         res = inference._parse_llm_decisions(wrong_len, 3)
         self.assertEqual(len(res), 3, "Failed to normalize decision list length")
-        self.assertEqual(res, [1, 2, 2], "Failed to pad short decision list with 2s")
         print("✓ LLM Parser logic is robust.")
@@ -71,30 +72,59 @@ class FinalIntegrityCheck(unittest.TestCase):
         print(f"✓ Spec matches. Found {len(tasks)} tasks.")
     def test_3_reward_boundary(self):
-        """Verify environment rewards stay strictly within [0.0, 1.0]"""
         print("\n[TEST 3] Reward Boundary Check...")
         env = FinAuditorEnvironment()
         obs = env.reset()
         # Simulate a step with some decisions
-        action = AuditorAction(decisions=[2] * len(obs.features))
         new_obs = env.step(action)
         reward = new_obs.reward
         self.assertIsNotNone(reward)
-        self.assertGreaterEqual(reward, 0.0, f"Reward {reward} < 0.0")
-        self.assertLessEqual(reward, 1.0, f"Reward {reward} > 1.0")
         print(f"✓ Reward boundary is safe: {reward}")
-    def test_4_integration_dry_run(self):
-        """Run a 2-step inference using a mocked OpenAI client"""
-        print("\n[TEST 4] Integration Dry Run...")
         # Mock the OpenAI client response
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
-        mock_response.choices[0].message.content = json.dumps({"decisions": [2] * 200}) # provide plenty
         with patch("inference._client.chat.completions.create", return_value=mock_response):
             f = io.StringIO()
@@ -102,18 +132,36 @@ class FinalIntegrityCheck(unittest.TestCase):
                 inference.run_inference()
             output = f.getvalue()
-            # Verify structured logs appear
-            self.assertIn("[START]", output)
-            self.assertIn("[STEP]  step=1", output)
-            self.assertIn("[STEP]  step=2", output)
-            self.assertIn("[END]", output)
-            # Check if rewards were logs
-            self.assertIn("reward=", output)
-            self.assertIn("cumulative_reward=", output)
-        print("✓ Integration dry run successful. Logs are correctly formatted.")
 if __name__ == "__main__":
     unittest.main(verbosity=1)

 #!/usr/bin/env python3
 import os
 import sys
+import re
 import json
 import yaml
 import unittest
         print("\n[TEST 1] LLM Parser Robustness...")
         # Test Case A: Markdown wrapped JSON
+        dirty_json = "Here is the result:\n```json\n{\"decisions\": [0, 1, 1]}\n```\nHope this helps!"
         res = inference._parse_llm_decisions(dirty_json, 3)
+        self.assertEqual(res, [0, 1, 1], "Failed to parse markdown-wrapped JSON")
         # Test Case B: Extra text before JSON
+        extra_text = "The decisions are as follows: {\"decisions\": [1, 1]}"
         res = inference._parse_llm_decisions(extra_text, 2)
+        self.assertEqual(res, [1, 1], "Failed to parse JSON with leading text")
+        # Test Case C: Malformed JSON -> should trigger 'Flag All' (1) fallback
         malformed = "{\"decisions\": [0, 1, " # Missing closing bracket
         res = inference._parse_llm_decisions(malformed, 4)
+        self.assertEqual(res, [1, 1, 1, 1], "Failed to trigger fallback on malformed JSON")
         # Test Case D: Correct length normalization
         wrong_len = "{\"decisions\": [1]}"
         res = inference._parse_llm_decisions(wrong_len, 3)
         self.assertEqual(len(res), 3, "Failed to normalize decision list length")
+        self.assertEqual(res, [1, 1, 1], "Failed to pad short decision list with 1s")
         print("✓ LLM Parser logic is robust.")
         print(f"✓ Spec matches. Found {len(tasks)} tasks.")
     def test_3_reward_boundary(self):
+        """Verify environment rewards stay strictly within (0.0, 1.0)"""
         print("\n[TEST 3] Reward Boundary Check...")
         env = FinAuditorEnvironment()
         obs = env.reset()
+        # Reset should return features now (not empty)
+        self.assertGreater(len(obs.features), 0, "Reset should return features for step 1")
         # Simulate a step with some decisions
+        action = AuditorAction(decisions=[1] * len(obs.features))
         new_obs = env.step(action)
         reward = new_obs.reward
         self.assertIsNotNone(reward)
+        self.assertGreater(reward, 0.0, f"Reward {reward} must be > 0.0 (not exact boundary)")
+        self.assertLess(reward, 1.0, f"Reward {reward} must be < 1.0 (not exact boundary)")
         print(f"✓ Reward boundary is safe: {reward}")
+    def test_4_reward_varies_by_action(self):
+        """Verify rewards differ between optimal and random agents"""
+        print("\n[TEST 4] Reward Variation Check...")
+        # Run with all-1 decisions (flag everything)
+        env1 = FinAuditorEnvironment()
+        obs1 = env1.reset()
+        action1 = AuditorAction(decisions=[1] * len(obs1.features))
+        result1 = env1.step(action1)
+        reward1 = result1.reward
+        # Run with all-0 decisions (pass everything)
+        env2 = FinAuditorEnvironment()
+        obs2 = env2.reset()
+        action2 = AuditorAction(decisions=[0] * len(obs2.features))
+        result2 = env2.step(action2)
+        reward2 = result2.reward
+        print(f"  All-flag reward: {reward1:.4f}")
+        print(f"  All-pass reward: {reward2:.4f}")
+        # In EASY mode (100% anomalies), flagging everything should score higher
+        self.assertNotEqual(reward1, reward2, "Rewards must differ between flag-all and pass-all")
+        print("✓ Rewards vary based on agent decisions.")
+    def test_5_stdout_format(self):
+        """Run a 2-step inference and verify stdout matches hackathon regex"""
+        print("\n[TEST 5] Stdout Format Compliance...")
         # Mock the OpenAI client response
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
+        mock_response.choices[0].message.content = json.dumps({"reasoning": "test", "decisions": [1] * 200})
         with patch("inference._client.chat.completions.create", return_value=mock_response):
             f = io.StringIO()
                 inference.run_inference()
             output = f.getvalue()
+            lines = [l for l in output.strip().split("\n") if l.strip()]
+            # Verify START tag format
+            start_line = lines[0]
+            start_match = re.match(r'^\[START\] task=\S+ env=\S+ model=\S+$', start_line)
+            self.assertIsNotNone(start_match, f"START line doesn't match regex: {start_line}")
+            # Verify STEP tag format
+            step_lines = [l for l in lines if l.startswith("[STEP]")]
+            self.assertTrue(len(step_lines) >= 1, "No STEP lines found")
+            for sl in step_lines:
+                step_match = re.match(
+                    r'^\[STEP\] step=\d+ action=\S+ reward=\d+\.\d{2} done=(true|false) error=\S+$',
+                    sl
+                )
+                self.assertIsNotNone(step_match, f"STEP line doesn't match regex: {sl}")
+            # Verify END tag format
+            end_line = lines[-1]
+            end_match = re.match(
+                r'^\[END\] success=(true|false) steps=\d+ rewards=[\d.,]+$',
+                end_line
+            )
+            self.assertIsNotNone(end_match, f"END line doesn't match regex: {end_line}")
+            # Verify NO JSON on stdout
+            self.assertNotIn("{", output, "Stdout must not contain JSON braces")
+            self.assertNotIn("}", output, "Stdout must not contain JSON braces")
+        print("✓ Stdout format is compliant with hackathon regex rules.")
 if __name__ == "__main__":
     unittest.main(verbosity=1)

hf auditor/src/reconciliation_engine.hpp CHANGED Viewed

@@ -395,9 +395,9 @@ public:
       out[2] = missing_freq;
       out[3] = static_cast<float>(slot.counterparty_id % 100) / 100.0f;
-      // CRITICAL: Mark slot as EMPTY after reporting to Python so it
-      // doesn't reappear in the next processing step.
-      pool_.set_state(idx, SlotState::EMPTY);
       ++row;
     }
@@ -465,6 +465,15 @@ public:
       total_reward += r;
     }
     return total_reward;
   }

       out[2] = missing_freq;
       out[3] = static_cast<float>(slot.counterparty_id % 100) / 100.0f;
+      // NOTE: Do NOT clear the slot here. The ground truth label must
+      // remain readable until compute_reward() processes the agent's
+      // decisions. Clearing happens inside compute_reward() instead.
       ++row;
     }
       total_reward += r;
     }
+    // Now that rewards are computed, clear the expired slots so they
+    // don't reappear in the next get_anomaly_matrix() call.
+    for (size_t i = 0; i < num_actions; ++i) {
+      const uint32_t idx = expired_buffer_[i];
+      if (pool_.get_state(idx) == SlotState::EXPIRED) {
+        pool_.set_state(idx, SlotState::EMPTY);
+      }
+    }
     return total_reward;
   }

inference.py CHANGED Viewed

@@ -1,10 +1,19 @@
 #!/usr/bin/env python3
 import os
 import sys
 import json
 import re
-import datetime
 import traceback
 import time
 from typing import List
@@ -27,6 +36,10 @@ except ImportError:
 from models import AuditorAction
 class LLMResponse(BaseModel):
     reasoning: str
     decisions: List[int]
@@ -39,6 +52,7 @@ if not HF_TOKEN:
     raise ValueError("CRITICAL: HF_TOKEN environment variable is missing.")
 TASK_ID:      str = os.getenv("TASK_ID", "anomaly_detection_hard")
 # FIX: Sync the inference max_steps default with the active task
 if "easy" in TASK_ID.lower():
@@ -72,9 +86,6 @@ Example:
 {"reasoning": "Trade 1 has high risk. Trade 2 is safe.", "decisions": [1, 0, 1]}
 """
-def _ts() -> str:
-    return datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z"
 def _build_user_prompt(step: int, features: list[list[float]]) -> str:
     lines = [
         f"Step {step}: You have {len(features)} flagged trades to audit.",
@@ -154,8 +165,10 @@ def _call_llm(step: int, features: list[list[float]]) -> list[int]:
             content = response.choices[0].message.content or ""
             return _parse_llm_decisions(content, len(features))
         except Exception as e:
             time.sleep(1)
     fallback_decisions = []
     for row in features:
         if len(row) >= 4:
@@ -166,72 +179,59 @@ def _call_llm(step: int, features: list[list[float]]) -> list[int]:
     return fallback_decisions
 def run_inference() -> None:
-    episode_id: str = "unknown"
-    total_reward: float = 0.0
     steps_completed: int = 0
-    status: str = "SUCCESS"
     try:
         env = FinAuditorEnvironment()
         obs = env.reset()
-        episode_id = getattr(env.state, 'episode_id', "test_run")
-        start_payload = {
-            "episode_id": episode_id,
-            "model": MODEL_NAME,
-            "difficulty": TASK_ID,
-            "max_steps": MAX_STEPS
-        }
-        print(f"[START] {json.dumps(start_payload)}", flush=True)
         for step_num in range(1, MAX_STEPS + 1):
-            step_reward = 0.0
             features = obs.features
             if not features:
                 action = AuditorAction(decisions=[])
-                _last_reasoning = "Empty matrix."
             else:
                 decisions = _call_llm(step_num, features)
                 action = AuditorAction(decisions=decisions)
             obs = env.step(action)
             step_reward = obs.reward if obs.reward is not None else 0.0
-            total_reward += step_reward
             steps_completed = step_num
-            # FIX: Ensure fractional precision is retained for validation
-            step_payload = {
-                "step": step_num,
-                "anomalies": len(features),
-                "reward": round(float(step_reward), 4),
-                "cumulative_reward": round(float(total_reward), 4),
-                "done": bool(obs.done),
-                "error": None,
-                "reasoning": _last_reasoning[:120].replace('\n', ' ') + "...",
-                "tp": getattr(env.state, 'last_tp', 0),
-                "tn": getattr(env.state, 'last_tn', 0),
-                "fp": getattr(env.state, 'last_fp', 0),
-                "fn": getattr(env.state, 'last_fn', 0)
-            }
-            print(f"[STEP] {json.dumps(step_payload)}", flush=True)
             if obs.done:
                 break
     except KeyboardInterrupt:
-        status = "INTERRUPTED"
     except Exception as exc:
-        status = "ERROR"
-        traceback.print_exc(file=sys.stderr)
-    avg_reward = total_reward / max(steps_completed, 1)
-    end_payload = {
-        "total_reward": round(float(total_reward), 4),
-        "avg_reward": round(float(avg_reward), 4),
-        "status": status
-    }
-    print(f"[END] {json.dumps(end_payload)}", flush=True)
 if __name__ == "__main__":
     run_inference()

 #!/usr/bin/env python3
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
+# inference.py — OpenEnv Evaluation Script (Hackathon Submission)
+#
+# STDOUT FORMAT (strict regex compliance):
+#   [START] task=<task_name> env=<benchmark> model=<model_name>
+#   [STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
+#   [END] success=<true|false> steps=<n> rewards=<r1,r2,...,rn>
+#
+# ALL debug output goes to stderr. NO JSON on stdout. NO extra whitespace.
+# ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
 import os
 import sys
 import json
 import re
 import traceback
 import time
 from typing import List
 from models import AuditorAction
+# ── Debug logger: ONLY to stderr ─────────────────────────────────────────────
+def _dbg(msg: str) -> None:
+    print(msg, file=sys.stderr, flush=True)
 class LLMResponse(BaseModel):
     reasoning: str
     decisions: List[int]
     raise ValueError("CRITICAL: HF_TOKEN environment variable is missing.")
 TASK_ID:      str = os.getenv("TASK_ID", "anomaly_detection_hard")
+ENV_NAME:     str = "fin_auditor"
 # FIX: Sync the inference max_steps default with the active task
 if "easy" in TASK_ID.lower():
 {"reasoning": "Trade 1 has high risk. Trade 2 is safe.", "decisions": [1, 0, 1]}
 """
 def _build_user_prompt(step: int, features: list[list[float]]) -> str:
     lines = [
         f"Step {step}: You have {len(features)} flagged trades to audit.",
             content = response.choices[0].message.content or ""
             return _parse_llm_decisions(content, len(features))
         except Exception as e:
+            _dbg(f"[LLM RETRY {attempt+1}/{max_retries}] {e}")
             time.sleep(1)
+    _dbg("[LLM] All retries exhausted, using risk_score fallback")
     fallback_decisions = []
     for row in features:
         if len(row) >= 4:
     return fallback_decisions
 def run_inference() -> None:
     steps_completed: int = 0
+    all_rewards: list[float] = []
+    success: bool = False
+    error_msg: str | None = None
+    # ── [START] — always emitted ──────────────────────────────────────────
+    print(f"[START] task={TASK_ID} env={ENV_NAME} model={MODEL_NAME}", flush=True)
     try:
         env = FinAuditorEnvironment()
         obs = env.reset()
+        _dbg(f"[DBG] Episode started. Features: {len(obs.features)} rows, difficulty: {TASK_ID}")
         for step_num in range(1, MAX_STEPS + 1):
+            step_reward = 0.0
             features = obs.features
             if not features:
                 action = AuditorAction(decisions=[])
+                _dbg(f"[DBG] Step {step_num}: Empty feature matrix")
             else:
                 decisions = _call_llm(step_num, features)
                 action = AuditorAction(decisions=decisions)
             obs = env.step(action)
             step_reward = obs.reward if obs.reward is not None else 0.0
+            all_rewards.append(step_reward)
             steps_completed = step_num
+            # ── [STEP] — plain text, 2 decimal places, lowercase bools ────
+            action_str = ",".join(str(d) for d in action.decisions) if action.decisions else "none"
+            done_str = "true" if obs.done else "false"
+            print(f"[STEP] step={step_num} action={action_str} reward={step_reward:.2f} done={done_str} error=null", flush=True)
+            _dbg(f"[DBG] Step {step_num}: reward={step_reward:.4f}, features={len(obs.features)}, done={obs.done}")
             if obs.done:
                 break
+        success = True
     except KeyboardInterrupt:
+        error_msg = "interrupted"
+        _dbg("[DBG] Interrupted by user")
     except Exception as exc:
+        error_msg = str(exc).replace("\n", " ")[:80]
+        _dbg(f"[ERROR] {traceback.format_exc()}")
+    finally:
+        # ── [END] — ALWAYS emitted, even on crash ────────────────────────
+        success_str = "true" if success else "false"
+        rewards_str = ",".join(f"{r:.2f}" for r in all_rewards) if all_rewards else "0.00"
+        print(f"[END] success={success_str} steps={steps_completed} rewards={rewards_str}", flush=True)
 if __name__ == "__main__":
     run_inference()

server/app.py CHANGED Viewed

@@ -66,6 +66,35 @@ else:
 app_metrics = {"last_step_latency_us": 0.0}
 @app.middleware("http")
 async def capture_step_latency(request: Request, call_next):
     if request.url.path == "/step":
@@ -711,7 +740,14 @@ async def root_dashboard():
             // FIX: Robust payload extraction handling regardless of OpenEnv wrapper depth
             const reward = data.reward ?? data.observation?.reward ?? data.info?.reward ?? 0.0;
             const done = data.done ?? data.observation?.done ?? data.info?.done ?? false;
-            const step = data.step_count ?? data.observation?.step_count ?? data.info?.step_count ?? data.observation?.metadata?.step_count ?? 'N/A';
             logMsg(`[RECON] Reward: ${reward.toFixed(4)} | Success`, reward >= 0.8 ? 'success' : 'warn');
@@ -732,10 +768,13 @@ async def root_dashboard():
         }
     }
-    // FIX: Auto-Reset the environment on boot so it actually has data to process
     window.addEventListener('DOMContentLoaded', async () => {
         logMsg("Auto-initializing environment engine...", "info");
         await executeReset();
         setInterval(updateState, 1000);
         updateState();
     });

 app_metrics = {"last_step_latency_us": 0.0}
+# ── Auto-bootstrap on startup ────────────────────────────────────────────────
+@app.on_event("startup")
+async def auto_bootstrap():
+    """Auto-authenticate with HF_TOKEN and initialize engine on boot."""
+    token = os.getenv("HF_TOKEN", "")
+    if token:
+        try:
+            from openai import AsyncOpenAI
+            client = AsyncOpenAI(base_url="https://router.huggingface.co/v1", api_key=token, max_retries=2)
+            try:
+                response = await client.models.list()
+                model_list = [m.id for m in response.data]
+            except Exception:
+                model_list = ["meta-llama/Meta-Llama-3-8B-Instruct"]
+            llm_session["api_key"] = token
+            llm_session["base_url"] = "https://router.huggingface.co/v1"
+            llm_session["available_models"] = model_list
+            if model_list:
+                llm_session["model_name"] = model_list[0]
+            system_health["key_validated"] = True
+            system_health["model_detected"] = len(model_list) > 0
+            system_health["connected"] = True
+            print(f"[BOOT] Auto-authenticated with HF_TOKEN. {len(model_list)} models discovered.")
+        except Exception as e:
+            print(f"[BOOT] Auto-auth failed: {e}")
+    else:
+        print("[BOOT] No HF_TOKEN found. Manual authentication required.")
 @app.middleware("http")
 async def capture_step_latency(request: Request, call_next):
     if request.url.path == "/step":
             // FIX: Robust payload extraction handling regardless of OpenEnv wrapper depth
             const reward = data.reward ?? data.observation?.reward ?? data.info?.reward ?? 0.0;
             const done = data.done ?? data.observation?.done ?? data.info?.done ?? false;
+            // Fetch the authoritative step count from /state
+            let step = 'N/A';
+            try {
+                const stateRes = await fetch('/state');
+                const stateData = await stateRes.json();
+                step = stateData.step_count ?? 'N/A';
+            } catch(se) {}  // Swallow — non-critical
             logMsg(`[RECON] Reward: ${reward.toFixed(4)} | Success`, reward >= 0.8 ? 'success' : 'warn');
         }
     }
+    // Auto-Reset the environment on boot so it actually has data to process,
+    // then try to authenticate with the default HF_TOKEN
     window.addEventListener('DOMContentLoaded', async () => {
         logMsg("Auto-initializing environment engine...", "info");
         await executeReset();
+        logMsg("Attempting default token auth...", "info");
+        await useDefault();
         setInterval(updateState, 1000);
         updateState();
     });

server/fin_auditor_environment.py CHANGED Viewed

@@ -98,14 +98,24 @@ class FinAuditorEnvironment(Environment):
     def reset(self) -> AuditorObservation:
         self._state = State(episode_id=str(uuid4()), step_count=0)
-        # We intentionally return an empty matrix on reset.
-        self.sim_time_ns += self._DELTA_MAX_NS
         self.engine.tick(self.sim_time_ns)
         return FinAuditorObservation(
-            features=[],
-            message="Fin Auditor engine ready.",
             reward=0.001 / self._MAX_EPISODE_STEPS,  # Safe fractional minimum
             done=False
         )

     def reset(self) -> AuditorObservation:
         self._state = State(episode_id=str(uuid4()), step_count=0)
+        # Re-initialize the engine for a clean episode
+        self.engine = hft_auditor.ReconciliationEngine(self._RING_BUFFER_CAPACITY)
+        self.sim_time_ns = 0
+        # Generate the first batch so step 1 has data to evaluate
+        self.engine.generate_batch(self.difficulty, self._INGEST_CHUNK_SIZE, self.sim_time_ns)
+        # Advance time past Δ_max to expire the batch
+        self.sim_time_ns += 6_000_000_000
         self.engine.tick(self.sim_time_ns)
+        # Get the anomaly matrix for the agent (features for step 1)
+        anomalies: list[list[float]] = self.engine.get_anomaly_matrix().tolist()
         return FinAuditorObservation(
+            features=anomalies,
+            message=f"Engine ready. {len(anomalies)} trades awaiting audit.",
             reward=0.001 / self._MAX_EPISODE_STEPS,  # Safe fractional minimum
             done=False
         )