Spaces:

KarlQuant
/

Quasar-Executo

Running

App Files Files Community

KarlQuant committed on Apr 24

Commit

a3cb980

verified ·

1 Parent(s): b022ced

Upload Quasar_axrvi_ranker.py

Browse files

Files changed (1) hide show

Quasar_axrvi_ranker.py +272 -42

Quasar_axrvi_ranker.py CHANGED Viewed

@@ -5845,16 +5845,27 @@ class HybridTrainer:
         self.loss_history.append(loss_dict)
         if self.ranker_logger:
             self.ranker_logger.training_update(
                 step=self.train_step,
                 loss=loss.item(),
                 lr=self.optimizer.param_groups[0]["lr"],
-                asset_count=len(valid),
             )
         logger.info(
             f"🧠 [TrainingStep {self.train_step:>6d}] "
             f"total={loss.item():.4f}  "
             f"rl={l_rl_raw.item():.4f}(n={l_rl.item():.4f})  "
             f"ce={l_ce_raw.item():.4f}(n={l_ce.item():.4f})  "
             f"rank={l_rank_raw.item():.4f}  "
@@ -6926,10 +6937,41 @@ class DerivWebSocketClient:
             pass   # best-effort cleanup
     async def send_message(self, msg: dict) -> bool:
         try:
             msg["req_id"] = self._next_msg_id()
-            await self.ws.send(json.dumps(msg))
             return True
         except Exception as e:
             logger.error(f"❌ Send error: {e}")
             return False
@@ -6977,11 +7019,20 @@ class DerivWebSocketClient:
                 symbols_to_restore = list(self._subscribed_symbols)
                 for symbol in symbols_to_restore:
                     try:
-                        await self.ws.send(json.dumps({
-                            "ticks": symbol, "subscribe": 1,
-                            "req_id": self._next_msg_id()
-                        }))
                         logger.info(f"🔄 Re-subscribed to {symbol} after reconnect")
                     except Exception as re_err:
                         logger.warning(f"⚠️  Re-subscription failed for {symbol}: {re_err}")
                 logger.info(
@@ -7061,7 +7112,7 @@ class QuasarAXRVIBridge:
         hub_ws_url:      str  = os.environ.get("QUASAR_HUB_URL", "ws://localhost:7860/ws/subscribe"),
         enable_logging:  bool = True,
         checkpoint_dir:  str  = "./Ranker10",  # new folder for 10-asset build
-        resume:          bool = False,          # FIX: fresh start
         hf_repo_id:      Optional[str] = "KarlQuant/quasar-axrvi-v10",  # new HF repo (10 assets)
     ):
         self.config          = config or AssetRankerConfig()
@@ -7069,6 +7120,23 @@ class QuasarAXRVIBridge:
         self.reward_strategy = reward_strategy
         self.enable_logging  = enable_logging and LOGGING_AVAILABLE
         # ── Checkpoint manager (local + optional HF sync) ─────────────────────
         self.checkpoint_mgr  = RankerCheckpointManager(
             checkpoint_dir=checkpoint_dir,
@@ -7076,6 +7144,26 @@ class QuasarAXRVIBridge:
         )
         self.resume          = resume
         # ── Structured logger (optional) ──────────────────────────────────────
         self.ranker_logger: Optional[object] = None
         self.log_bridge:    Optional[object] = None
@@ -7176,6 +7264,23 @@ class QuasarAXRVIBridge:
         self._last_final_scores:    Optional[np.ndarray] = None
         self._last_value_estimates: Dict[str, float]     = {}
         # [S2] Pending-episode store: keyed by trade_id.
         # s_t is captured at trade-open time; s_{t+1} is captured at close.
         # This gives a proper (s_t, a_t, r_t, s_{t+1}) tuple with s_t ∈ F_t,
@@ -8183,20 +8288,35 @@ class QuasarAXRVIBridge:
                 # has dropped below the 4-trade floor.  If so, trigger a fresh
                 # rank_and_gate() immediately — don't wait for the next scheduled
                 # _rank_loop tick — so the minimum is restored as fast as possible.
                 if closed_any:
                     open_count = len(self.position_mgr.get_open_trades())
                     if open_count < 4:
-                        logger.warning(
-                            f"[monitor_positions] ⚠️  REFILL TRIGGER — "
-                            f"open_count={open_count} < 4 after close. "
-                            f"Calling rank_and_gate() immediately to restore top-4."
-                        )
-                        try:
-                            await self.rank_and_gate()
-                        except Exception as refill_err:
-                            logger.error(
-                                f"[monitor_positions] ❌ Refill rank_and_gate error: {refill_err}"
                             )
                 await asyncio.sleep(2)
             except Exception as e:
@@ -8307,17 +8427,22 @@ class QuasarAXRVIBridge:
         """
         Fire-and-forget HTTP POST of the current AXRVI‑scored ranking list to
         the hub's /api/flip/rankings endpoint, including the real flip_direction.
         """
         if not ranked:
             return
-        try:
-            import urllib.request as _urlreq
-            # Build payload with flip_direction from each asset's snapshot.
-            # FIX: fall back to r.dominant_signal (from rank_risk_neutral, which
-            # reads snap.dominant_signal at ranking time) when the snapshot is
-            # missing or still NEUTRAL — prevents all non-publishing assets from
-            # being broadcast with flip_direction="NEUTRAL" which blocks Gate A.
             rankings_payload = []
             for r in ranked:
                 snap = self.hub_subscriber.get_snapshot(r.space_name)
@@ -8325,15 +8450,15 @@ class QuasarAXRVIBridge:
                 flip_dir = (
                     snap_signal
                     if snap_signal not in ("NEUTRAL", "NONE", None, "")
-                    else r.dominant_signal   # set from snap at rank_risk_neutral time
                 )
                 rankings_payload.append({
                     "space_name":        r.space_name,
                     "score":             r.score,
                     "final_priority":    r.final_priority,
                     "rank":              r.rank,
-                    "dominant_signal":   flip_dir,          # real BUY/SELL/NEUTRAL
-                    "flip_direction":    flip_dir,          # explicit field for executor
                     "avn_accuracy":      r.avn_accuracy,
                     "signal_confidence": r.signal_confidence,
                     "epistemic_std":     r.epistemic_std,
@@ -8342,17 +8467,63 @@ class QuasarAXRVIBridge:
             payload = json.dumps({"rankings": rankings_payload}).encode()
             url = f"{self._hub_http_url}/api/flip/rankings"
-            req = _urlreq.Request(
-                url,
-                data=payload,
-                headers={"Content-Type": "application/json"},
-                method="POST",
-            )
-            _urlreq.urlopen(req, timeout=0.5)
-        except Exception as _e:
-            logger.debug(f"[Rankings] Hub push skipped: {_e}")
     async def rank_and_gate(self) -> None:
         """
         v6/v7 Shreve Ranking Cycle:
           1. Data readiness check
@@ -8870,6 +9041,9 @@ class QuasarAXRVIBridge:
                 self.ws_client.listen(),
                 self._rank_loop(),
                 self.monitor_positions(),
             )
         except asyncio.CancelledError:
             pass
@@ -8899,9 +9073,62 @@ class QuasarAXRVIBridge:
             try:
                 await self.rank_and_gate()
             except Exception as e:
-                logger.error(f"❌ Rank loop error: {e}")
             await asyncio.sleep(self.config.update_frequency_seconds)
     def start_sync(self) -> None:
         """Start in synchronous (threading) mode."""
         def _run_loop():
@@ -10702,7 +10929,7 @@ async def run_live_trading_system(
     enable_logging:  bool = True,
     shreve_config:   Optional[ShreveConfig] = None,
     checkpoint_dir:  str  = "./Ranker10",
-    resume:          bool = False,   # FIX: fresh start
     hf_repo_id:      Optional[str] = "KarlQuant/quasar-axrvi-v10",  # new HF repo (10 assets)
 ) -> None:
     config = AssetRankerConfig(
@@ -11017,10 +11244,13 @@ def _parse_args():
                         help="[S7] Gate E martingale deviation threshold (default 0.05)")
     parser.add_argument("--checkpoint-dir", default="./Ranker10",
                         help="Directory for full-state checkpoints (default ./Ranker10)")
-    parser.add_argument("--no-resume", dest="no_resume", action="store_true", default=True,
-                        help="Default True — always fresh start.")
     parser.add_argument("--resume", dest="no_resume", action="store_false",
-                        help="Restore from latest Ranker10 checkpoint")
     parser.add_argument("--hf-repo", default=None,
                         metavar="OWNER/REPO",
                         help="Hugging Face Dataset repo for checkpoint sync "
@@ -11077,7 +11307,7 @@ if __name__ == "__main__":
         hub_ws_url      = args.hub,
         enable_logging  = not args.no_logs,
         checkpoint_dir  = args.checkpoint_dir,
-        resume          = not args.no_resume,   # default False — always start fresh on Ranker10
         hf_repo_id      = args.hf_repo or "KarlQuant/quasar-axrvi-v10",
     )

         self.loss_history.append(loss_dict)
         if self.ranker_logger:
+            # [LABEL FIX] Previously passed len(valid) here — that's the
+            # BATCH SIZE (number of episodes in this training step), not
+            # the asset count. With TRAIN_BATCH=2 the field always showed
+            # "asset_count=2" which looked like "only 2 of 10 assets are
+            # training", but in fact every episode carries the full
+            # (N=10, T, F) tensor and all 10 assets are trained per step.
+            #
+            # We now pass the TRUE asset count (model.num_assets) so the
+            # dashboard/log reflects reality, and also log batch_size in
+            # the human-readable line below so batch health stays visible.
             self.ranker_logger.training_update(
                 step=self.train_step,
                 loss=loss.item(),
                 lr=self.optimizer.param_groups[0]["lr"],
+                asset_count=self.model.num_assets,
             )
         logger.info(
             f"🧠 [TrainingStep {self.train_step:>6d}] "
             f"total={loss.item():.4f}  "
+            f"assets={self.model.num_assets} batch={len(valid)}/{len(episodes)}  "
             f"rl={l_rl_raw.item():.4f}(n={l_rl.item():.4f})  "
             f"ce={l_ce_raw.item():.4f}(n={l_ce.item():.4f})  "
             f"rank={l_rank_raw.item():.4f}  "
             pass   # best-effort cleanup
     async def send_message(self, msg: dict) -> bool:
         """
         Send one JSON message over the Deriv websocket with a hard 10s timeout.

         The message dict is stamped in place with a fresh ``req_id`` (taken
         from ``self._next_msg_id()``) before being serialised and sent.

         [HANG FIX — Layer 1] ``self.ws.send()`` is wrapped in
         ``asyncio.wait_for`` so a send that never returns (the original fix
         notes attribute this to half-open TCP connections) cannot freeze the
         caller indefinitely.

         On timeout the connection is marked dead (``self.connected = False``)
         and ``self.reconnect()`` is scheduled as a background task, so the
         caller gets its ``False`` immediately instead of waiting for the
         reconnect to finish.

         Args:
             msg: Request payload; mutated to include ``"req_id"``.

         Returns:
             True if the send completed within 10 seconds, False on timeout or
             on any other send error (the error is logged, never raised).
         """
         try:
             msg["req_id"] = self._next_msg_id()
             await asyncio.wait_for(
                 self.ws.send(json.dumps(msg)),
                 timeout=10.0,
             )
             return True
         except asyncio.TimeoutError:
             logger.error(
                 "❌ Deriv ws.send() stalled >10s — connection is half-open. "
                 "Scheduling reconnect."
             )
             self.connected = False
             # Keep a strong reference to the background task: the event loop
             # only holds weak references, so an unreferenced task may be
             # garbage-collected before it finishes. Re-using the slot also
             # stops several stalled callers from each launching their own
             # concurrent reconnect().
             in_flight = getattr(self, "_send_timeout_reconnect_task", None)
             if in_flight is None or in_flight.done():
                 # We are inside a running coroutine, so a running loop always
                 # exists here and create_task cannot fail for lack of one.
                 self._send_timeout_reconnect_task = asyncio.get_running_loop().create_task(
                     self.reconnect()
                 )
             return False
         except Exception as e:
             # Best-effort contract: any other failure is reported as False.
             logger.error(f"❌ Send error: {e}")
             return False
                 symbols_to_restore = list(self._subscribed_symbols)
                 for symbol in symbols_to_restore:
                     try:
+                        # [HANG FIX — Layer 1] Same 10s cap as send_message so a
+                        # stalled re-subscribe can't hang the reconnect task.
+                        await asyncio.wait_for(
+                            self.ws.send(json.dumps({
+                                "ticks": symbol, "subscribe": 1,
+                                "req_id": self._next_msg_id()
+                            })),
+                            timeout=10.0,
+                        )
                         logger.info(f"🔄 Re-subscribed to {symbol} after reconnect")
+                    except asyncio.TimeoutError:
+                        logger.warning(
+                            f"⚠️  Re-subscription to {symbol} timed out — will retry on next reconnect"
+                        )
                     except Exception as re_err:
                         logger.warning(f"⚠️  Re-subscription failed for {symbol}: {re_err}")
                 logger.info(
         hub_ws_url:      str  = os.environ.get("QUASAR_HUB_URL", "ws://localhost:7860/ws/subscribe"),
         enable_logging:  bool = True,
         checkpoint_dir:  str  = "./Ranker10",  # new folder for 10-asset build
+        resume:          bool = True,           # [RESUME FIX] default ON — see __init__
         hf_repo_id:      Optional[str] = "KarlQuant/quasar-axrvi-v10",  # new HF repo (10 assets)
     ):
         self.config          = config or AssetRankerConfig()
         self.reward_strategy = reward_strategy
         self.enable_logging  = enable_logging and LOGGING_AVAILABLE
+        # ── [RESUME FIX] Environment variable override ────────────────────────
+        # HF Spaces entrypoints usually can't pass CLI flags — they just run
+        # `python Quasar_axrvi_ranker.py`. To control resume behaviour there,
+        # set the QUASAR_RESUME environment variable in the Space's secrets:
+        #   QUASAR_RESUME=0 / false / no  → start fresh (overrides constructor)
+        #   QUASAR_RESUME=1 / true / yes  → resume from latest checkpoint
+        #   (unset)                        → use the constructor argument
+        _env_resume = os.environ.get("QUASAR_RESUME", "").strip().lower()
+        if _env_resume in ("0", "false", "no", "off"):
+            logger.warning(
+                "[RESUME] QUASAR_RESUME env var forces fresh start "
+                "(overriding resume=True constructor argument)"
+            )
+            resume = False
+        elif _env_resume in ("1", "true", "yes", "on"):
+            resume = True
         # ── Checkpoint manager (local + optional HF sync) ─────────────────────
         self.checkpoint_mgr  = RankerCheckpointManager(
             checkpoint_dir=checkpoint_dir,
         )
         self.resume          = resume
+        # ── [RESUME FIX] Startup banner ───────────────────────────────────────
+        # Prints resume state + HF sync status in a single eyeballable block so
+        # you can tell at a glance whether checkpoints will actually be used
+        # and mirrored. The most common failure mode on Spaces is a missing
+        # HF_TOKEN secret — that goes silent without this banner.
+        _hf_enabled = self.checkpoint_mgr._hf.enabled
+        _hf_token   = "✅ set" if os.environ.get("HF_TOKEN") else "❌ missing"
+        _hf_repo    = os.environ.get("HF_REPO_ID") or hf_repo_id or "—"
+        logger.info(
+            "\n" + "═" * 66 + "\n"
+            f"  📦 CHECKPOINT CONFIG\n"
+            f"    resume       : {self.resume}  "
+            f"({'will attempt to restore on start' if self.resume else 'FRESH START — no restore'})\n"
+            f"    checkpoint_dir: {checkpoint_dir}\n"
+            f"    hf_repo       : {_hf_repo}\n"
+            f"    hf_token      : {_hf_token}\n"
+            f"    hf_sync       : {'✅ enabled' if _hf_enabled else '❌ disabled (set HF_TOKEN + HF_REPO_ID)'}\n"
+            + "═" * 66
+        )
         # ── Structured logger (optional) ──────────────────────────────────────
         self.ranker_logger: Optional[object] = None
         self.log_bridge:    Optional[object] = None
         self._last_final_scores:    Optional[np.ndarray] = None
         self._last_value_estimates: Dict[str, float]     = {}
+        # ── [HANG FIX — Layer 2] Re-entrancy guard for rank_and_gate ──────────
+        # Both _rank_loop and monitor_positions' refill trigger call
+        # rank_and_gate(). Without this lock they can enter concurrently and
+        # corrupt shared state (rank_count, _last_value_estimates, the ranked
+        # list, and the position manager). The lock is created lazily in
+        # rank_and_gate() itself because asyncio.Lock() must be created inside
+        # a running event loop on Python <3.10.
+        self._rank_lock: Optional[asyncio.Lock] = None
+        # ── [HANG FIX — Layer 3] Watchdog timestamp ───────────────────────────
+        # Updated at the END of every successful rank_and_gate() cycle.
+        # _rank_watchdog() checks this and force-closes the ws (triggering
+        # reconnect) if no cycle has completed within RANK_STALL_THRESHOLD_S.
+        # This is the safety net that recovers from ANY cause of stall —
+        # not just the ws.send() one we know about.
+        self._last_rank_complete_ts: float = time.time()
         # [S2] Pending-episode store: keyed by trade_id.
         # s_t is captured at trade-open time; s_{t+1} is captured at close.
         # This gives a proper (s_t, a_t, r_t, s_{t+1}) tuple with s_t ∈ F_t,
                 # has dropped below the 4-trade floor.  If so, trigger a fresh
                 # rank_and_gate() immediately — don't wait for the next scheduled
                 # _rank_loop tick — so the minimum is restored as fast as possible.
+                #
+                # [HANG FIX — Layer 2] Only trigger a refill if rank_and_gate
+                # is NOT currently running. Re-entering rank_and_gate while the
+                # scheduled _rank_loop call is still inside it corrupts shared
+                # state and can deadlock on the Deriv WS. If it's running, the
+                # next scheduled tick will pick up the refill need within
+                # update_frequency_seconds — acceptable latency vs a deadlock.
                 if closed_any:
                     open_count = len(self.position_mgr.get_open_trades())
                     if open_count < 4:
+                        if self._rank_lock is not None and self._rank_lock.locked():
+                            logger.info(
+                                f"[monitor_positions] ⏩ REFILL SKIPPED — "
+                                f"rank_and_gate already running; next scheduled "
+                                f"tick will restore top-4 (open_count={open_count})"
+                            )
+                        else:
+                            logger.warning(
+                                f"[monitor_positions] ⚠️  REFILL TRIGGER — "
+                                f"open_count={open_count} < 4 after close. "
+                                f"Calling rank_and_gate() immediately to restore top-4."
                             )
+                            try:
+                                await self.rank_and_gate()
+                            except Exception as refill_err:
+                                logger.error(
+                                    f"[monitor_positions] ❌ Refill rank_and_gate error: {refill_err}",
+                                    exc_info=True,
+                                )
                 await asyncio.sleep(2)
             except Exception as e:
         """
         Fire-and-forget HTTP POST of the current AXRVI‑scored ranking list to
         the hub's /api/flip/rankings endpoint, including the real flip_direction.
+        [HANG FIX — Layers 4 & 5]
+          • Previously the timeout was 0.5s and failures logged at DEBUG, so
+            transient hub slowness silently dropped rankings with no visibility.
+          • Now: timeout=3.0s, failures logged at WARNING.
+          • The HTTP POST runs in a thread-pool executor so hub latency never
+            blocks the async rank loop — even if the hub is completely dead,
+            the ranker keeps producing rankings.
         """
         if not ranked:
             return
+        # Capture the payload here (in the async caller's context) so the
+        # snapshot read is consistent, then hand the blocking HTTP POST to
+        # the executor.
+        try:
             rankings_payload = []
             for r in ranked:
                 snap = self.hub_subscriber.get_snapshot(r.space_name)
                 flip_dir = (
                     snap_signal
                     if snap_signal not in ("NEUTRAL", "NONE", None, "")
+                    else r.dominant_signal
                 )
                 rankings_payload.append({
                     "space_name":        r.space_name,
                     "score":             r.score,
                     "final_priority":    r.final_priority,
                     "rank":              r.rank,
+                    "dominant_signal":   flip_dir,
+                    "flip_direction":    flip_dir,
                     "avn_accuracy":      r.avn_accuracy,
                     "signal_confidence": r.signal_confidence,
                     "epistemic_std":     r.epistemic_std,
             payload = json.dumps({"rankings": rankings_payload}).encode()
             url = f"{self._hub_http_url}/api/flip/rankings"
+        except Exception as build_err:
+            logger.warning(f"[Rankings] Payload build failed: {build_err}")
+            return
+        def _do_post() -> None:
+            """Blocking HTTP POST — runs on a worker thread, not the event loop."""
+            try:
+                import urllib.request as _urlreq
+                req = _urlreq.Request(
+                    url,
+                    data=payload,
+                    headers={"Content-Type": "application/json"},
+                    method="POST",
+                )
+                _urlreq.urlopen(req, timeout=3.0)
+            except Exception as post_err:
+                # Warn-level so we see repeated failures in the log and can
+                # diagnose whether the hub is the problem next time.
+                logger.warning(f"[Rankings] Hub push failed: {post_err}")
+        try:
+            loop = asyncio.get_running_loop()
+            loop.run_in_executor(None, _do_post)
+        except RuntimeError:
+            # No running loop (e.g. called from sync context) — do it inline.
+            _do_post()
     async def rank_and_gate(self) -> None:
         """
         Run one ranking cycle under ``self._rank_lock``.

         [HANG FIX — Layers 2 & 3] Thin wrapper around
         ``self._rank_and_gate_impl()``:

           • Holds ``self._rank_lock`` for the whole cycle so that concurrent
             callers of this method are serialised instead of running the
             implementation at the same time.
           • Stamps ``self._last_rank_complete_ts`` (wall clock, ``time.time()``)
             when the cycle ends, whether it succeeded or raised. A fast
             failure still counts as "not stalled".
           • Logs a WARNING when the cycle itself took longer than 5 seconds.
             The duration is measured with a monotonic clock, excludes time
             spent waiting for the lock, and is reported for failed cycles too.

         Raises:
             Whatever ``_rank_and_gate_impl()`` raises; nothing is swallowed.
         """
         # Created lazily so the lock is instantiated from inside a coroutine
         # (i.e. with an event loop running) rather than at construction time.
         if self._rank_lock is None:
             self._rank_lock = asyncio.Lock()
         async with self._rank_lock:
             started = time.monotonic()
             outcome = "completed"
             try:
                 await self._rank_and_gate_impl()
             except BaseException:
                 # Only used to label the slow-cycle log line; always re-raised.
                 outcome = "failed"
                 raise
             finally:
                 # Stamp even on failure: the stall detector should react to a
                 # stuck cycle, not to one that is failing loudly.
                 self._last_rank_complete_ts = time.time()
                 elapsed = time.monotonic() - started
                 if elapsed > 5.0:
                     logger.warning(f"[RankCycle] {outcome} slowly in {elapsed:.2f}s")
+    async def _rank_and_gate_impl(self) -> None:
         """
         v6/v7 Shreve Ranking Cycle:
           1. Data readiness check
                 self.ws_client.listen(),
                 self._rank_loop(),
                 self.monitor_positions(),
+                # [HANG FIX — Layer 3] Watchdog task: auto-recover if the rank
+                # loop stops completing cycles (ws.send() hang, deadlock, etc.)
+                self._rank_watchdog(),
             )
         except asyncio.CancelledError:
             pass
             try:
                 await self.rank_and_gate()
             except Exception as e:
+                # [HANG FIX — Layer 4] exc_info=True so we get a full traceback
+                # in the ranker log the next time something breaks silently.
+                # Previously errors were logged as a one-liner with no stack
+                # frame, making root-cause diagnosis impossible.
+                logger.error(f"❌ Rank loop error: {e}", exc_info=True)
             await asyncio.sleep(self.config.update_frequency_seconds)
+    # ── [HANG FIX — Layer 3] Rank-loop watchdog ───────────────────────────────
+    # Periodically checks self._last_rank_complete_ts. If no rank cycle has
+    # completed within RANK_STALL_THRESHOLD_S, assumes the rank loop is hung
+    # (almost always on a ws.send() to a half-open Deriv connection) and
+    # force-closes the websocket. That raises ConnectionClosed inside any
+    # pending send/recv, unblocking the await and triggering reconnect().
+    #
+    # This is the safety net: even if a NEW bug introduces a different hang,
+    # the system auto-recovers within RANK_STALL_THRESHOLD_S instead of
+    # sitting dead until a human restarts the Space.
+    RANK_STALL_THRESHOLD_S: float = 120.0   # 4× the expected worst-case cycle
+    RANK_WATCHDOG_POLL_S:   float = 30.0
+    async def _rank_watchdog(self) -> None:
+        """Force-close the Deriv ws if the rank loop stops completing cycles."""
+        logger.info(
+            f"🐕 [RankWatchdog] started | stall_threshold={self.RANK_STALL_THRESHOLD_S}s "
+            f"| poll_interval={self.RANK_WATCHDOG_POLL_S}s"
+        )
+        while self.running:
+            await asyncio.sleep(self.RANK_WATCHDOG_POLL_S)
+            if not self.running:
+                break
+            since_last = time.time() - self._last_rank_complete_ts
+            if since_last > self.RANK_STALL_THRESHOLD_S:
+                logger.critical(
+                    f"🚨 [RankWatchdog] Rank loop has not completed a cycle for "
+                    f"{since_last:.0f}s (threshold={self.RANK_STALL_THRESHOLD_S}s). "
+                    f"Forcing Deriv ws close to unblock any pending send()."
+                )
+                # Force-close the ws. This raises ConnectionClosed inside
+                # whatever coroutine is awaiting ws.send() or ws.recv(),
+                # unblocking the rank loop. listen()'s except branch will
+                # then drive reconnect().
+                try:
+                    if self.ws_client and self.ws_client.ws:
+                        await self.ws_client.ws.close()
+                except Exception as close_err:
+                    logger.warning(
+                        f"[RankWatchdog] ws.close() raised (expected on dead socket): {close_err}"
+                    )
+                # Reset the stamp so we don't spam CRITICAL every poll interval
+                # while reconnect is in progress.
+                self._last_rank_complete_ts = time.time()
+            else:
+                logger.debug(
+                    f"[RankWatchdog] healthy | last_cycle={since_last:.1f}s ago"
+                )
     def start_sync(self) -> None:
         """Start in synchronous (threading) mode."""
         def _run_loop():
     enable_logging:  bool = True,
     shreve_config:   Optional[ShreveConfig] = None,
     checkpoint_dir:  str  = "./Ranker10",
+    resume:          bool = True,    # [RESUME FIX] default ON — env var QUASAR_RESUME overrides
     hf_repo_id:      Optional[str] = "KarlQuant/quasar-axrvi-v10",  # new HF repo (10 assets)
 ) -> None:
     config = AssetRankerConfig(
                         help="[S7] Gate E martingale deviation threshold (default 0.05)")
     parser.add_argument("--checkpoint-dir", default="./Ranker10",
                         help="Directory for full-state checkpoints (default ./Ranker10)")
+    # [RESUME FIX] Default is now --resume (load latest checkpoint).
+    # Pass --no-resume to deliberately start fresh.
+    # Env var QUASAR_RESUME=0|false further overrides this in the bridge.
+    parser.add_argument("--no-resume", dest="no_resume", action="store_true", default=False,
+                        help="Start fresh, ignoring any existing checkpoint (default: resume).")
     parser.add_argument("--resume", dest="no_resume", action="store_false",
+                        help="Restore from the latest Ranker10 checkpoint (default).")
     parser.add_argument("--hf-repo", default=None,
                         metavar="OWNER/REPO",
                         help="Hugging Face Dataset repo for checkpoint sync "
         hub_ws_url      = args.hub,
         enable_logging  = not args.no_logs,
         checkpoint_dir  = args.checkpoint_dir,
+        resume          = not args.no_resume,   # [RESUME FIX] default True — env var QUASAR_RESUME overrides
         hf_repo_id      = args.hf_repo or "KarlQuant/quasar-axrvi-v10",
     )