Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

data/data_fetcher.py +16 -68
data/data_loader.py +17 -5
data/ohlc_stats.npz +1 -1
log.log +2 -2
models/model.py +45 -13
scripts/analyze_hyperparams.py +282 -236
scripts/cache_dataset.py +6 -8
train.py +12 -6

data/data_fetcher.py CHANGED Viewed

@@ -628,81 +628,29 @@ class DataFetcher:
     def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int, full_history: bool = False) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
         """
-        Fetches trades for a token.
-        If full_history is True, fetches ALL trades (ignores H/B/H limits).
-        Otherwise, uses the 3-part H/B/H strategy if the total count exceeds a threshold.
-        Returns three lists: early_trades, middle_trades, recent_trades.
         """
         if not token_address:
             return [], [], []
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
-        # 1. Get the total count if we care about H/B/H logic
-        if not full_history:
-            count_query = "SELECT count() FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s"
-            try:
-                total_trades = self.db_client.execute(count_query, params)[0][0]
-                print(f"INFO: Found {total_trades} total trades for token {token_address} before {T_cutoff}.")
-            except Exception as e:
-                print(f"ERROR: Could not count trades for token {token_address}: {e}")
-                return [], [], []
-        else:
-             total_trades = 0 # Dummy value, ignored
-        # 2. Decide which query to use
-        # If full_history is ON, or count is low, fetch everything.
-        if full_history or total_trades < count_threshold:
-            mode = "Full History" if full_history else "Low Count"
-            # print(f"INFO: Fetching all trades ({mode}).")
-            query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
-            try:
-                rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
-                if not rows: return [], [], []
-                columns = [col[0] for col in columns_info]
-                all_trades = [dict(zip(columns, row)) for row in rows]
-                # When not using HBH or fetching full history, all trades are considered "early" (or just one big block)
-                return all_trades, [], []
-            except Exception as e:
-                print(f"ERROR: Failed to fetch all trades for token {token_address}: {e}")
-                return [], [], []
-        # 3. Use the H/B/H strategy if the count is high AND not full_history
-        print("INFO: Fetching trades using 3-part High-Def/Blurry/High-Def strategy.")
         try:
-            # Fetch Early (High-Def)
-            early_query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC LIMIT %(limit)s"
-            early_rows, early_cols_info = self.db_client.execute(early_query, {'token_address': token_address, 'T_cutoff': T_cutoff, 'limit': early_limit}, with_column_types=True)
-            early_trades = [dict(zip([c[0] for c in early_cols_info], r)) for r in early_rows] if early_rows else []
-            # Fetch Recent (High-Def)
-            recent_query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp DESC LIMIT %(limit)s"
-            recent_rows, recent_cols_info = self.db_client.execute(recent_query, {'token_address': token_address, 'T_cutoff': T_cutoff, 'limit': recent_limit}, with_column_types=True)
-            recent_trades = [dict(zip([c[0] for c in recent_cols_info], r)) for r in recent_rows] if recent_rows else []
-            recent_trades.reverse() # Order ASC
-            # Fetch Middle (Blurry - successful trades only)
-            middle_trades = []
-            if early_trades and recent_trades:
-                start_middle_ts = early_trades[-1]['timestamp']
-                end_middle_ts = recent_trades[0]['timestamp']
-                if start_middle_ts < end_middle_ts:
-                    middle_query = """
-                    SELECT * FROM trades
-                    WHERE base_address = %(token_address)s
-                      AND success = true
-                      AND timestamp > %(start_ts)s
-                      AND timestamp < %(end_ts)s
-                    ORDER BY timestamp ASC
-                    """
-                    middle_params = {'token_address': token_address, 'start_ts': start_middle_ts, 'end_ts': end_middle_ts}
-                    middle_rows, middle_cols_info = self.db_client.execute(middle_query, middle_params, with_column_types=True)
-                    middle_trades = [dict(zip([c[0] for c in middle_cols_info], r)) for r in middle_rows] if middle_rows else []
-            return early_trades, middle_trades, recent_trades
         except Exception as e:
-            print(f"ERROR: Failed to fetch H/B/H trades for token {token_address}: {e}")
             return [], [], []
     def fetch_future_trades_for_token(self,

     def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int, full_history: bool = False) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
         """
+        Fetches ALL trades for a token up to T_cutoff, ordered by time.
+        Notes:
+        - This intentionally does NOT apply the older fetch-time H/B/H (High-Def / Blurry / High-Def)
+          sampling logic. Sequence-length control is handled later in data_loader.py via event-level
+          head/tail sampling with MIDDLE/RECENT markers.
+        - The function signature still includes legacy H/B/H parameters for compatibility.
+        Returns: (all_trades, [], [])
         """
+        # An empty/None address short-circuits to the empty triple without touching the DB.
         if not token_address:
             return [], [], []
+        # count_threshold, early_limit, recent_limit and full_history are not read below;
+        # only token_address and T_cutoff are bound into the query.
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
+        query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
         try:
+            rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+            if not rows:
+                return [], [], []
+            # columns_info entries are indexed with [0] to obtain the column name;
+            # each row is then turned into a {column_name: value} dict.
+            columns = [col[0] for col in columns_info]
+            all_trades = [dict(zip(columns, row)) for row in rows]
+            return all_trades, [], []
         except Exception as e:
+            # Best-effort: any failure is printed and reported to the caller as "no trades",
+            # so callers cannot distinguish a DB error from a token with no trades.
+            print(f"ERROR: Failed to fetch trades for token {token_address}: {e}")
             return [], [], []
     def fetch_future_trades_for_token(self,

data/data_loader.py CHANGED Viewed

@@ -142,6 +142,10 @@ class OracleDataset(Dataset):
         self.fetcher = data_fetcher
         self.cache_dir = Path(cache_dir) if cache_dir else None
         # If a fetcher is provided, we can determine the number of samples.
         # Otherwise, we are likely in a test mode where __len__ might not be called
@@ -149,7 +153,13 @@ class OracleDataset(Dataset):
         self.t_cutoff_seconds = max(0, int(t_cutoff_seconds or 0))
         self.token_allowlist = set(token_allowlist) if token_allowlist else None
-        if self.cache_dir and self.cache_dir.is_dir():
             print(f"INFO: Initializing dataset in offline (cached) mode from: {self.cache_dir}")
             # Scan for cached files to determine length
             self.cached_files = sorted(self.cache_dir.glob("sample_*.pt"), key=lambda p: int(p.stem.split('_')[1]))
@@ -1201,7 +1211,8 @@ class OracleDataset(Dataset):
              pooler=pooler,
              sample_idx=idx,
              cached_holders_list=raw_data.get('holder_snapshots_list'),
-             cached_ohlc_1s=raw_data.get('ohlc_1s')
         )
     def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
@@ -1394,7 +1405,8 @@ class OracleDataset(Dataset):
             pooler: EmbeddingPooler,
             sample_idx: Optional[int] = None,
             cached_holders_list: List[List[str]] = None,
-            cached_ohlc_1s: Optional[torch.Tensor] = None
         ) -> Optional[Dict[str, Any]]:
         """
         Processes raw token data into a structured dataset item for a specific T_cutoff.
@@ -1683,7 +1695,7 @@ class OracleDataset(Dataset):
                 'embedding_pooler': pooler,
                 'labels': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
                 'labels_mask': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
-                'quality_score': torch.tensor(raw_data['quality_score'], dtype=torch.float32)
             }
         # Ensure sorted
@@ -1759,5 +1771,5 @@ class OracleDataset(Dataset):
             'embedding_pooler': pooler,
             'labels': torch.tensor(label_values, dtype=torch.float32),
             'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
-            'quality_score': torch.tensor(raw_data['quality_score'], dtype=torch.float32)
         }

         self.fetcher = data_fetcher
         self.cache_dir = Path(cache_dir) if cache_dir else None
+        # Always define these so DataLoader workers don't crash with AttributeError if
+        # initialization falls through an unexpected branch.
+        self.cached_files = []
+        self.weights_list = []
         # If a fetcher is provided, we can determine the number of samples.
         # Otherwise, we are likely in a test mode where __len__ might not be called
         self.t_cutoff_seconds = max(0, int(t_cutoff_seconds or 0))
         self.token_allowlist = set(token_allowlist) if token_allowlist else None
+        if self.cache_dir:
+            if not self.cache_dir.is_dir():
+                raise RuntimeError(
+                    f"Cache directory '{self.cache_dir}' was provided but is not a directory. "
+                    "Fix the path or disable cached mode."
+                )
+            # Cached/offline mode
             print(f"INFO: Initializing dataset in offline (cached) mode from: {self.cache_dir}")
             # Scan for cached files to determine length
             self.cached_files = sorted(self.cache_dir.glob("sample_*.pt"), key=lambda p: int(p.stem.split('_')[1]))
              pooler=pooler,
              sample_idx=idx,
              cached_holders_list=raw_data.get('holder_snapshots_list'),
+             cached_ohlc_1s=raw_data.get('ohlc_1s'),
+             quality_score=raw_data.get('quality_score')
         )
     def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
             pooler: EmbeddingPooler,
             sample_idx: Optional[int] = None,
             cached_holders_list: List[List[str]] = None,
+            cached_ohlc_1s: Optional[torch.Tensor] = None,
+            quality_score: Optional[float] = None
         ) -> Optional[Dict[str, Any]]:
         """
         Processes raw token data into a structured dataset item for a specific T_cutoff.
                 'embedding_pooler': pooler,
                 'labels': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
                 'labels_mask': torch.zeros(len(self.horizons_seconds), dtype=torch.float32),
+                'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
             }
         # Ensure sorted
             'embedding_pooler': pooler,
             'labels': torch.tensor(label_values, dtype=torch.float32),
             'labels_mask': torch.tensor(mask_values, dtype=torch.float32),
+            'quality_score': torch.tensor(quality_score if quality_score is not None else 0.0, dtype=torch.float32)
         }

data/ohlc_stats.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f2c86bf03e5761e7fb319a54274e032f7aa1d01dd5873f2f44a52c9e0be5244
 size 1660

 version https://git-lfs.github.com/spec/v1
+oid sha256:46809f070aa1dfcb4f53d7390b1b6ff370e6828e198df4c0df5632ac6fa9f607
 size 1660

log.log CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:461e55d31752fd72f09aa30c5bcc3a619654ae86ddf1e759c9c57b0dc5db53f6
-size 21794

 version https://git-lfs.github.com/spec/v1
+oid sha256:41885991264f1522ec8b539dd4f3f738d537102a65103a800578229feef13880
+size 18007

models/model.py CHANGED Viewed

@@ -3,7 +3,7 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from transformers import AutoConfig, AutoModel
 from typing import List, Dict, Any, Optional, Tuple
 import os
 import json
@@ -32,7 +32,7 @@ class Oracle(nn.Module):
                  multi_modal_dim: int,
                  event_pad_id: int,
                  event_type_to_id: Dict[str, int],
-                 model_config_name: str = "Qwen/Qwen3-0.6B",
                  quantiles: List[float] = [0.1, 0.5, 0.9],
                  horizons_seconds: List[int] = [30, 60, 120, 240, 420],
                  dtype: torch.dtype = torch.bfloat16):
@@ -53,12 +53,43 @@ class Oracle(nn.Module):
         self.num_outputs = len(quantiles) * len(horizons_seconds)
         self.dtype = dtype
-        # --- 2. Load Qwen3 Configuration (architecture only; training from scratch) ---
-        hf_token = os.getenv("Hf_TOKEN") or os.getenv("HF_TOKEN")
-        hf_kwargs = {"token": hf_token} if hf_token else {}
-        model_config = AutoConfig.from_pretrained(model_config_name, trust_remote_code=True, **hf_kwargs)
-        self.d_model = model_config.hidden_size
-        self.model = AutoModel.from_config(model_config, trust_remote_code=True)
         self.model.to(self.device, dtype=self.dtype)
         # Quantile prediction head (maps pooled hidden state -> flattened horizon/quantile grid)
@@ -225,8 +256,9 @@ class Oracle(nn.Module):
         # --- NEW: Embedding for timeframe ID (re-uses protocol_embedding) ---
         self.lighthouse_timeframe_embedding = nn.Embedding(vocab.NUM_LIGHTHOUSE_TIMEFRAMES, self.d_model)
-        # --- NEW: Embeddings for Special Context Tokens ---
-        self.special_context_tokens = {'Middle': 0, 'RECENT': 1}
         self.special_context_embedding = nn.Embedding(len(self.special_context_tokens), self.d_model)
@@ -906,19 +938,19 @@ class Oracle(nn.Module):
     def _get_special_context_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
         """
-        Handles special context tokens like 'Middle' and 'RECENT' by adding their unique learnable embeddings.
         """
         device = self.device
         event_type_ids = batch['event_type_ids']
         B, L = event_type_ids.shape
-        middle_id = self.event_type_to_id.get('Middle', -1)
         recent_id = self.event_type_to_id.get('RECENT', -1)
         middle_mask = (event_type_ids == middle_id)
         recent_mask = (event_type_ids == recent_id)
-        middle_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['Middle'], device=device))
         recent_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['RECENT'], device=device))
         # Add the embeddings at the correct locations

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from transformers import AutoModel, LlamaConfig
 from typing import List, Dict, Any, Optional, Tuple
 import os
 import json
                  multi_modal_dim: int,
                  event_pad_id: int,
                  event_type_to_id: Dict[str, int],
+                 model_config_name: str = "llama3-12l-768d-gqa4-8k-random",
                  quantiles: List[float] = [0.1, 0.5, 0.9],
                  horizons_seconds: List[int] = [30, 60, 120, 240, 420],
                  dtype: torch.dtype = torch.bfloat16):
         self.num_outputs = len(quantiles) * len(horizons_seconds)
         self.dtype = dtype
+        # --- 2. Backbone: Llama-style decoder, RANDOM INIT (no pretrained weights) ---
+        # This gives you RoPE + modern decoder blocks and lets HF use optimized attention
+        # implementations (SDPA / FlashAttention) without us implementing a transformer.
+        #
+        # Size target: ~80-120M params, suitable for 8k-ish seq caps with your data regime.
+        attn_impl = os.getenv("HF_ATTN_IMPL", "sdpa")  # "sdpa" (safe) or "flash_attention_2" (if installed)
+        llama_cfg = LlamaConfig(
+            # Model size
+            hidden_size=768,
+            intermediate_size=3072,
+            num_hidden_layers=12,
+            num_attention_heads=12,
+            # GQA-style KV heads (Llama 3-style efficiency knob)
+            num_key_value_heads=4,
+            # Long context (must be >= your effective max sequence length)
+            max_position_embeddings=8192,
+            # Llama 3 uses a large theta; harmless for random init and helps longer contexts.
+            rope_theta=500000.0,
+            rms_norm_eps=1e-5,
+            # Unused when providing inputs_embeds, but required by config
+            vocab_size=32000,
+        )
+        self.d_model = llama_cfg.hidden_size
+        # Older transformers versions may not support attn_implementation in from_config.
+        # Also, flash_attention_2 requires optional deps; fall back to SDPA if unavailable.
+        try:
+            self.model = AutoModel.from_config(llama_cfg, attn_implementation=attn_impl)
+        except TypeError:
+            self.model = AutoModel.from_config(llama_cfg)
+        except Exception:
+            if attn_impl != "sdpa":
+                self.model = AutoModel.from_config(llama_cfg, attn_implementation="sdpa")
+            else:
+                raise
+        # Disable KV cache during training (saves memory; not used for full-seq training).
+        if hasattr(self.model, "config"):
+            self.model.config.use_cache = False
         self.model.to(self.device, dtype=self.dtype)
         # Quantile prediction head (maps pooled hidden state -> flattened horizon/quantile grid)
         # --- NEW: Embedding for timeframe ID (re-uses protocol_embedding) ---
         self.lighthouse_timeframe_embedding = nn.Embedding(vocab.NUM_LIGHTHOUSE_TIMEFRAMES, self.d_model)
+        # --- Embeddings for Special Context Tokens ---
+        # Must match vocabulary event names (see models/vocabulary.py).
+        self.special_context_tokens = {'MIDDLE': 0, 'RECENT': 1}
         self.special_context_embedding = nn.Embedding(len(self.special_context_tokens), self.d_model)
     def _get_special_context_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
         """
+        Handles special context tokens like 'MIDDLE' and 'RECENT' by adding their unique learnable embeddings.
         """
         device = self.device
         event_type_ids = batch['event_type_ids']
         B, L = event_type_ids.shape
+        middle_id = self.event_type_to_id.get('MIDDLE', -1)
         recent_id = self.event_type_to_id.get('RECENT', -1)
         middle_mask = (event_type_ids == middle_id)
         recent_mask = (event_type_ids == recent_id)
+        middle_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['MIDDLE'], device=device))
         recent_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['RECENT'], device=device))
         # Add the embeddings at the correct locations

scripts/analyze_hyperparams.py CHANGED Viewed

@@ -1,255 +1,301 @@
 import os
-import sys
-import torch
-import numpy as np
 import argparse
-from tqdm import tqdm
-from datetime import datetime, timezone
-from collections import defaultdict
-# Add project root to path
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from data.data_loader import OracleDataset, DataFetcher
-import os
-import sys
-import numpy as np
-import argparse
-from tqdm import tqdm
-from datetime import datetime, timezone
-import collections
-# Add project root to path
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from data.data_loader import DataFetcher
-import os
-import sys
-import numpy as np
-import argparse
-from tqdm import tqdm
-from datetime import datetime, timezone
-import collections
-from dotenv import load_dotenv
-from clickhouse_driver import Client as ClickHouseClient
-from neo4j import GraphDatabase
-# Add project root to path
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from data.data_loader import DataFetcher
-def parse_args():
-    parser = argparse.ArgumentParser(description="Analyze dataset to tune hyperparameters (Horizons, Seq Len)")
-    parser.add_argument("--max_samples", type=int, default=5000, help="Max samples to analyze")
-    parser.add_argument("--token_address", type=str, default=None, help="Specific token address to analyze")
-    return parser.parse_args()
-def main():
     load_dotenv()
     args = parse_args()
-    print("--- Hyperparameter Calibration Analysis (SQL) ---")
-    # DB Connection
-    ch_host = os.getenv("CLICKHOUSE_HOST", "localhost")
-    ch_port = int(os.getenv("CLICKHOUSE_NATIVE_PORT", 9000))
-    neo_uri = os.getenv("NEO4J_URI", "bolt://localhost:7687")
-    neo_user = os.getenv("NEO4J_USER", "neo4j")
-    neo_pass = os.getenv("NEO4J_PASSWORD", "password")
-    print(f"Connecting to ClickHouse at {ch_host}:{ch_port}...")
-    clickhouse_client = ClickHouseClient(host=ch_host, port=ch_port)
-    print(f"Connecting to Neo4j at {neo_uri}...")
-    neo4j_driver = GraphDatabase.driver(neo_uri, auth=(neo_user, neo_pass))
-    # 1. Initialize DataFetcher
-    fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
-    print("DataFetcher initialized.")
-    # 2. Fetch Sample Mints
     if args.token_address:
-        print(f"Analyzing specific token: {args.token_address}")
-        # Try to find mint timestamp
-        query = f"SELECT mint_address, timestamp FROM mints WHERE mint_address = '{args.token_address}'"
-        mints = fetcher.db_client.execute(query)
-        if not mints:
-            print("Token not found in mints table. Trying to use first trade timestamp...")
-            # Fallback if not in mints table
-            q2 = f"SELECT base_address, min(timestamp) FROM trades WHERE base_address = '{args.token_address}' GROUP BY base_address"
-            mints = fetcher.db_client.execute(q2)
-        if not mints:
-            print("Token not found in trades either (or no trades). Exiting.")
             return
     else:
-        print(f"Fetching {args.max_samples} sample tokens...")
-        # Fetch random mints
-        query = f"""
-        SELECT mint_address, timestamp FROM mints
-        ORDER BY rand()
-        LIMIT {args.max_samples}
-        """
-        mints = fetcher.db_client.execute(query)
-        print(f"Fetched {len(mints)} tokens.")
-    # Metrics to collect
-    lifespans = []   # Time from mint to last trade
-    time_to_ath = [] # Time from mint to highest price
-    # Sequence Length estimations
-    windows_to_test = [5, 10, 30, 60] # Minutes
-    event_counts = {w: [] for w in windows_to_test}
-    full_history_counts = []
-    print(f"Analyzing trades for {len(mints)} tokens...")
-    for mint_addr, mint_ts in tqdm(mints):
-        try:
-            if isinstance(mint_ts, datetime) and mint_ts.tzinfo is None:
-                mint_ts = mint_ts.replace(tzinfo=timezone.utc)
-            t0 = mint_ts.timestamp()
-            # Fetch ALL trades for this token
-            # We don't need full enrichments, just timestamp and price
-            # Args: token_addr, T_cutoff, count_threshold, early_lim, recent_lim, full_history
-            now_ts = datetime.now(timezone.utc)
-            trades, _, _ = fetcher.fetch_trades_for_token(mint_addr, now_ts, 0, 0, 0, full_history=True)
-            if not trades: continue
-            # Trades are usually sorted, but ensure
-            trades.sort(key=lambda x: x['timestamp'])
-            # Lifespan
-            last_ts = trades[-1]['timestamp'].timestamp()
-            lifespans.append(last_ts - t0)
-            # Time to ATH
-            max_price = -1.0
-            ath_ts = 0.0
-            valid_trades = []
-            for t in trades:
-                p = float(t.get('price_usd', 0.0))
-                # Basic filter for garbage prints
-                if p > 0:
-                    valid_trades.append(t)
-                    if p > max_price:
-                        max_price = p
-                        ath_ts = t['timestamp'].timestamp()
-            if max_price > 0:
-                time_to_ath.append(ath_ts - t0)
-            # --- Sequence Length Metrics ---
-            full_history_counts.append(len(valid_trades))
-            # Windowed counts
-            counts_in_window = {w: 0 for w in windows_to_test}
-            for t in valid_trades:
-                ts_val = t['timestamp'].timestamp()
-                elapsed_min = (ts_val - t0) / 60.0
-                for w in windows_to_test:
-                    if elapsed_min <= w:
-                        counts_in_window[w] += 1
-            for w in windows_to_test:
-                event_counts[w].append(counts_in_window[w])
-        except Exception as e:
-            print(f"Error processing {mint_addr}: {e}")
-            import traceback
-            traceback.print_exc()
-            pass
-    # --- Stats Calculation ---
-    def print_stats(name, data):
-        if not data:
-            print(f"{name}: No Data")
             return
-        # Convert to numpy array for easier filtering if needed, though they are lists
-        arr = np.array(data)
-        p25 = np.percentile(arr, 25)
-        p50 = np.percentile(arr, 50)
-        p75 = np.percentile(arr, 75)
-        p90 = np.percentile(arr, 90)
-        p95 = np.percentile(arr, 95)
-        p99 = np.percentile(arr, 99)
-        max_val = np.max(arr)
-        print(f"[{name}]")
-        print(f"  Mean: {np.mean(arr):.2f} | Median: {p50:.2f} | Max: {max_val:.2f}")
-        print(f"  25%: {p25:.2f} | 75%: {p75:.2f} | 90%: {p90:.2f} | 95%: {p95:.2f} | 99%: {p99:.2f}")
-    print("\n" + "="*40)
-    print("RESULTS (ALL TOKENS)")
-    print("="*40)
-    # Time Stats
-    lifespans_min = [x/60.0 for x in lifespans]
-    time_to_ath_min = [x/60.0 for x in time_to_ath]
-    print_stats("Token Lifespan (Minutes)", lifespans_min)
-    print("\n")
-    print_stats("Time to ATH (Minutes)", time_to_ath_min)
-    print("\n" + "-"*20)
-    print("SEQUENCE LENGTHS (Trades Only)")
-    print("-"*20)
-    print_stats("Full History Length", full_history_counts)
-    for w in windows_to_test:
-        print("\n")
-        print_stats(f"Trades in First {w} Minutes", event_counts[w])
-    # --- High Activity Subset ---
-    print("\n" + "="*40)
-    print("RESULTS (HIGH ACTIVITY SUBSET)")
-    print("Filter: > 50 trades AND > 5 min lifespan")
-    print("="*40)
-    # Filter indices
-    valid_indices = []
-    for i, count in enumerate(full_history_counts):
-        if count > 50 and lifespans_min[i] > 5.0:
-            valid_indices.append(i)
-    if not valid_indices:
-        print("No high activity tokens found.")
-    else:
-        print(f"Found {len(valid_indices)} high activity tokens out of {len(full_history_counts)}.")
-        subset_lifespans = [lifespans_min[i] for i in valid_indices]
-        subset_ath = [time_to_ath_min[i] for i in valid_indices if i < len(time_to_ath_min)] # careful with length if sizes differ? they shouldn't by logic, but time_to_ath depends on if trade > 0
-        # indices are aligned with loop order
-        # But wait, time_to_ath was appended only if max_price > 0.
-        # This misalignment is risky.
-        # Better: Store dicts or tuples in the main loop instead of parallel lists.
-        # Quick fix: Just recalc stats on lists is hard if not aligned?
-        # Actually time_to_ath might be shorter than lifespans.
-        # Let's just print what we can, assuming simple filtering on `event_counts` which aligns 1:1 with loop (except exceptions).
-        # Re-collect logic for subsets is cleaner if we store objects.
-        # But let's just do Event Counts which are critical for seq_len.
-        subset_history = [full_history_counts[i] for i in valid_indices]
-        print_stats("Subset: Full History Length", subset_history)
-        for w in windows_to_test:
-            subset_w = [event_counts[w][i] for i in valid_indices]
-            print("\n")
-            print_stats(f"Subset: Trades in First {w} Min", subset_w)
-    print("\nRecommendation Logic:")
-    print("1. Horizons: Look at 'Time to ATH' p90 (or p90 of Subset).")
-    print("2. Max Seq Len: Look at 'Trades in First X Minutes' (X ~= Max Horizon).")
 if __name__ == "__main__":
     main()

 import os
 import argparse
+from typing import List, Optional, Sequence, Tuple
+from dotenv import load_dotenv
+from clickhouse_driver import Client as ClickHouseClient
+# Build the command-line parser for the SQL-based analysis script and parse sys.argv.
+# Options:
+#   --token_address  optional single token to analyze (default: None)
+#   --windows_min    comma-separated minute windows, kept as a raw string here
+#   --min_price_usd  float threshold; trades at or below it are treated as invalid
+# Returns the populated argparse.Namespace.
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Fast SQL-based hyperparameter analysis (trades-only) for seq_len + horizons."
+    )
+    parser.add_argument("--token_address", type=str, default=None, help="Analyze a single token address.")
+    parser.add_argument(
+        "--windows_min",
+        type=str,
+        default="5,10,30,60",
+        help="Comma-separated trade-count windows in minutes (e.g. '5,10,30,60').",
+    )
+    parser.add_argument(
+        "--min_price_usd",
+        type=float,
+        default=0.0,
+        help="Treat trades with price_usd <= min_price_usd as invalid (default: 0.0).",
+    )
+    return parser.parse_args()
+# Convert a comma-separated string of minute windows (e.g. "5,10,30,60") into a
+# sorted list of unique positive ints.
+# Raises ValueError when a non-empty segment is not an integer (from int()) or
+# when no positive value remains after filtering.
+def _parse_windows(windows_min: str) -> List[int]:
+    out: List[int] = []
+    # `or ""` tolerates None (or an empty string) being passed in.
+    for part in (windows_min or "").split(","):
+        part = part.strip()
+        # Skip empty segments such as those produced by "5,,10" or a trailing comma.
+        if not part:
+            continue
+        out.append(int(part))
+    # Drop non-positive values, de-duplicate, and order ascending.
+    out = sorted(set([w for w in out if w > 0]))
+    if not out:
+        raise ValueError("No valid --windows_min provided.")
+    return out
+# Create a ClickHouse client configured from environment variables.
+# CLICKHOUSE_HOST (default "localhost") and CLICKHOUSE_NATIVE_PORT (default "9000")
+# are always passed. CLICKHOUSE_USER, CLICKHOUSE_PASSWORD and CLICKHOUSE_DB are
+# forwarded only when set to a non-empty value; otherwise the keyword is omitted.
+def _connect_clickhouse_from_env() -> ClickHouseClient:
+    ch_host = os.getenv("CLICKHOUSE_HOST", "localhost")
+    # int() raises ValueError if CLICKHOUSE_NATIVE_PORT is set to a non-numeric value.
+    ch_port = int(os.getenv("CLICKHOUSE_NATIVE_PORT", "9000"))
+    ch_user = os.getenv("CLICKHOUSE_USER", None)
+    ch_pass = os.getenv("CLICKHOUSE_PASSWORD", None)
+    ch_db = os.getenv("CLICKHOUSE_DB", None)
+    kwargs = {"host": ch_host, "port": ch_port}
+    # Truthiness checks: an empty-string variable is treated the same as an unset one.
+    if ch_user:
+        kwargs["user"] = ch_user
+    if ch_pass:
+        kwargs["password"] = ch_pass
+    if ch_db:
+        kwargs["database"] = ch_db
+    return ClickHouseClient(**kwargs)
+# Return the quantile levels (as fractions in 0..1) used for the distribution statistics.
+# The order matters: the formatting helpers pair these positionally with fixed labels.
+def _quantile_levels() -> Sequence[float]:
+    # Keep these aligned with the printed labels below.
+    return (0.25, 0.5, 0.75, 0.90, 0.95, 0.99)
+# Render quantile values as "25%: x | 50%: y | ..." with two decimals each.
+# Values are paired positionally with the fixed labels; zip() stops at the shorter
+# sequence, so surplus values (or labels) are dropped without an error.
+def _fmt_q_tuple(q: Tuple[float, ...]) -> str:
+    # Labels match _quantile_levels()
+    labels = ["25%", "50%", "75%", "90%", "95%", "99%"]
+    parts = []
+    for lbl, v in zip(labels, q):
+        # float() normalizes whatever numeric type is passed in before formatting.
+        parts.append(f"{lbl}: {float(v):.2f}")
+    return " | ".join(parts)
+# Print a three-line summary for one metric: a "[prefix]" header, a line with
+# mean / median / max, and a line with every quantile.
+# The median is read from q_tuple[1], which relies on 0.5 being the second
+# quantile level; q_tuple must therefore contain at least two values.
+def _print_row(prefix: str, mean_v: float, q_tuple: Tuple[float, ...], max_v: float) -> None:
+    print(f"[{prefix}]")
+    print(f"  Mean: {float(mean_v):.2f} | Median: {float(q_tuple[1]):.2f} | Max: {float(max_v):.2f}")
+    print(f"  {_fmt_q_tuple(q_tuple)}")
+def fetch_aggregated_stats_sql(
+    ch: ClickHouseClient,
+    windows_min: List[int],
+    min_price_usd: float,
+    token_address: Optional[str] = None,
+) -> List[tuple]:
+    """
+    Compute distribution statistics for all minted tokens in a single ClickHouse query.
+    Per-token metrics are built in the `per_token` CTE (only trades with
+    price_usd > min_price_usd count as valid; tokens without any valid trade are
+    dropped) and then aggregated into one row per group, ordered by group name:
+      - grp='all'    : every token in `per_token`
+      - grp='subset' : tokens with trades_full > 50 and lifespan_sec > 300 (5 minutes)
+    Row layout: grp, tokens, then (mean, quantiles, max) for trades_full,
+    lifespan in minutes and time-to-ATH in minutes, followed by one
+    (mean, quantiles, max) triple per entry of `windows_min`.
+    If `token_address` is given, `per_token` is restricted to that mint.
+    """
+    quantile_args = ", ".join(str(level) for level in _quantile_levels())
+    # Per-token columns: valid trades within the first N minutes after mint.
+    window_count_cols = [
+        f"countIf(is_valid AND (trade_ts - mint_ts) <= {int(w) * 60}) AS trades_{w}m"
+        for w in windows_min
+    ]
+    # Aggregate columns: mean / quantiles / max of each per-token window count.
+    window_stat_cols = [
+        f"avg(trades_{w}m) AS trades_{w}m_mean,"
+        f" quantilesExact({quantile_args})(trades_{w}m) AS trades_{w}m_q,"
+        f" max(trades_{w}m) AS trades_{w}m_max"
+        for w in windows_min
+    ]
+    per_token_select = ", ".join(window_count_cols)
+    stats_select = ", ".join(window_stat_cols)
+    params = {"min_price": float(min_price_usd)}
+    if token_address:
+        params["token"] = token_address
+        token_filter = "AND m.mint_address = %(token)s"
+    else:
+        token_filter = ""
+    # The trades subquery is limited to minted tokens before the join, for speed.
+    query = f"""
+    WITH
+      per_token AS (
+        SELECT
+          m.mint_address AS mint_address,
+          toUnixTimestamp(m.timestamp) AS mint_ts,
+          countIf(is_valid) AS trades_full,
+          (maxIf(trade_ts, is_valid) - mint_ts) AS lifespan_sec,
+          (toUnixTimestamp(argMaxIf(t.timestamp, t.price_usd, is_valid)) - mint_ts) AS time_to_ath_sec,
+          {per_token_select}
+        FROM mints AS m
+        INNER JOIN
+        (
+          SELECT
+            base_address,
+            timestamp,
+            toUnixTimestamp(timestamp) AS trade_ts,
+            price_usd,
+            (price_usd > %(min_price)s) AS is_valid
+          FROM trades
+          WHERE base_address IN (SELECT mint_address FROM mints)
+        ) AS t
+        ON t.base_address = m.mint_address
+        WHERE 1=1
+          {token_filter}
+        GROUP BY
+          mint_address,
+          mint_ts
+        HAVING
+          trades_full > 0
+      )
+    SELECT
+      grp,
+      count() AS tokens,
+      avg(trades_full) AS trades_full_mean,
+      quantilesExact({quantile_args})(trades_full) AS trades_full_q,
+      max(trades_full) AS trades_full_max,
+      avg(lifespan_sec / 60.0) AS lifespan_min_mean,
+      quantilesExact({quantile_args})(lifespan_sec / 60.0) AS lifespan_min_q,
+      max(lifespan_sec / 60.0) AS lifespan_min_max,
+      avg(time_to_ath_sec / 60.0) AS tta_min_mean,
+      quantilesExact({quantile_args})(time_to_ath_sec / 60.0) AS tta_min_q,
+      max(time_to_ath_sec / 60.0) AS tta_min_max,
+      {stats_select}
+    FROM per_token
+    ARRAY JOIN ['all', 'subset'] AS grp
+    WHERE (grp = 'all')
+       OR (grp = 'subset' AND trades_full > 50 AND lifespan_sec > 300)
+    GROUP BY grp
+    ORDER BY grp
+    """
+    return ch.execute(query, params)
+def fetch_single_token_sql(
+    ch: ClickHouseClient,
+    windows_min: List[int],
+    min_price_usd: float,
+    token_address: str,
+) -> Optional[tuple]:
+    """
+    Fetch the per-token metrics for one mint address.
+    Only trades with price_usd > min_price_usd count as valid. Returns None when
+    the query yields no row (no matching mint, or no valid trades); otherwise the
+    first row, laid out as:
+      mint_address, mint_ts, trades_full, lifespan_sec, time_to_ath_sec,
+      then one trades_<w>m count per entry of `windows_min`, in order.
+    """
+    # Valid trades within the first N minutes after mint, one column per window.
+    window_count_cols = [
+        f"countIf(is_valid AND (trade_ts - mint_ts) <= {int(w) * 60}) AS trades_{w}m"
+        for w in windows_min
+    ]
+    window_select = ", ".join(window_count_cols)
+    params = {"min_price": float(min_price_usd), "token": token_address}
+    query = f"""
+    SELECT
+      m.mint_address AS mint_address,
+      toUnixTimestamp(m.timestamp) AS mint_ts,
+      countIf(is_valid) AS trades_full,
+      (maxIf(trade_ts, is_valid) - mint_ts) AS lifespan_sec,
+      (toUnixTimestamp(argMaxIf(t.timestamp, t.price_usd, is_valid)) - mint_ts) AS time_to_ath_sec,
+      {window_select}
+    FROM mints AS m
+    INNER JOIN
+    (
+      SELECT
+        base_address,
+        timestamp,
+        toUnixTimestamp(timestamp) AS trade_ts,
+        price_usd,
+        (price_usd > %(min_price)s) AS is_valid
+      FROM trades
+      WHERE base_address = %(token)s
+    ) AS t
+    ON t.base_address = m.mint_address
+    WHERE m.mint_address = %(token)s
+    GROUP BY
+      mint_address,
+      mint_ts
+    HAVING
+      trades_full > 0
+    """
+    result_rows = ch.execute(query, params)
+    if not result_rows:
+        return None
+    return result_rows[0]
+def main() -> None:
+    """Run the calibration analysis and print the results to stdout.
+    When `args.token_address` is set, prints the metrics of that single token;
+    otherwise prints distribution statistics for every group returned by
+    `fetch_aggregated_stats_sql`. Both modes return early (without the closing
+    recommendation text) when the query yields nothing.
+    """
     load_dotenv()
     args = parse_args()
+    windows_min = _parse_windows(args.windows_min)
+    print("--- Hyperparameter Calibration Analysis (FAST SQL) ---")
+    print(f"Windows (min): {windows_min}")
+    print(f"Valid trade filter: price_usd > {float(args.min_price_usd)}")
+    ch = _connect_clickhouse_from_env()
     if args.token_address:
+        # Single-token mode: raw per-token metrics, no distribution statistics.
+        row = fetch_single_token_sql(
+            ch=ch,
+            windows_min=windows_min,
+            min_price_usd=float(args.min_price_usd),
+            token_address=args.token_address,
+        )
+        if not row:
+            print("Token not found (or no valid trades).")
             return
+        # Row layout: mint_address, mint_ts, trades_full, lifespan_sec,
+        # time_to_ath_sec, then one window count per entry of windows_min.
+        mint_addr = row[0]
+        trades_full = int(row[2])
+        # The query reports seconds; the report shows minutes.
+        lifespan_min = float(row[3]) / 60.0
+        tta_min = float(row[4]) / 60.0
+        print("\n" + "=" * 40)
+        print("RESULTS (SINGLE TOKEN)")
+        print("=" * 40)
+        print(f"Token: {mint_addr}")
+        print(f"Valid trades: {trades_full}")
+        print(f"Lifespan (min): {lifespan_min:.2f}")
+        print(f"Time to ATH (min): {tta_min:.2f}")
+        # Window counts start at column 5, in the same order as windows_min.
+        for i, w in enumerate(windows_min):
+            print(f"Trades in first {w}m: {int(row[5 + i])}")
     else:
+        # Distribution mode: one result row per group.
+        rows = fetch_aggregated_stats_sql(
+            ch=ch,
+            windows_min=windows_min,
+            min_price_usd=float(args.min_price_usd),
+            token_address=None,
+        )
+        if not rows:
+            print("No tokens found with valid trades.")
             return
+        print("\n" + "=" * 40)
+        print("RESULTS (DISTRIBUTION)")
+        print("=" * 40)
+        # Row layout:
+        # grp, tokens,
+        # trades_full_mean, trades_full_q(tuple), trades_full_max,
+        # lifespan_min_mean, lifespan_min_q(tuple), lifespan_min_max,
+        # tta_min_mean, tta_min_q(tuple), tta_min_max,
+        # repeated for each window: mean, q(tuple), max
+        for row in rows:
+            grp = row[0]
+            tokens = int(row[1])
+            print(f"\n--- Group: {grp} (tokens={tokens}) ---")
+            _print_row("Trades (Full History, Valid Only)", row[2], row[3], row[4])
+            print("")
+            _print_row("Token Lifespan (Minutes)", row[5], row[6], row[7])
+            print("")
+            _print_row("Time to ATH (Minutes)", row[8], row[9], row[10])
+            # The per-window triples follow the 11 fixed columns; walk them
+            # three columns at a time, in windows_min order.
+            cursor = 11
+            for w in windows_min:
+                mean_v = row[cursor]
+                q_v = row[cursor + 1]
+                max_v = row[cursor + 2]
+                cursor += 3
+                print("")
+                _print_row(f"Trades in First {w} Minutes (Valid Only)", mean_v, q_v, max_v)
+    # Static guidance on how to read the numbers above.
+    print("\nRecommendation Logic (Trades-only):")
+    print("- Horizons: look at Time-to-ATH p90/p95 (all vs subset).")
+    print("- Max seq len: look at Trades-in-first-(max horizon) p95/p99.")
+    print("  Then add headroom for non-trade events (transfers/pool/liquidity/etc).")
+# Run the analysis only when this file is executed as a script, not on import.
 if __name__ == "__main__":
     main()

scripts/cache_dataset.py CHANGED Viewed

@@ -309,19 +309,17 @@ def main():
                 n_burns = len(item.get("burns", []))
                 n_supply_locks = len(item.get("supply_locks", []))
                 n_migrations = len(item.get("migrations", []))
                 n_ohlc = len(item.get("ohlc_1s", [])) if item.get("ohlc_1s") is not None else 0
                 n_snapshots_5m = len(item.get("snapshots_5m", []))
                 n_holders = len(item.get("holder_snapshots_list", []))
-                tqdm.write(f"  + Cached: {mint_addr} | Class: {class_id} | Q: {q_score:.4f}")
                 tqdm.write(
-                    "    Events | "
-                    f"Trades: {n_trades} | Transfers: {n_transfers} | Pool Creations: {n_pool_creations} | "
-                    f"Liquidity Changes: {n_liquidity_changes} | Fee Collections: {n_fee_collections} | "
-                    f"Burns: {n_burns} | Supply Locks: {n_supply_locks} | Migrations: {n_migrations}"
-                )
-                tqdm.write(
-                    f"    Derived | Mint: 1 | Ohlc 1s: {n_ohlc} | Snapshots 5m: {n_snapshots_5m} | Holder Snapshots: {n_holders}"
                 )
             except Exception as e:

                 n_burns = len(item.get("burns", []))
                 n_supply_locks = len(item.get("supply_locks", []))
                 n_migrations = len(item.get("migrations", []))
+                n_mints = 1 if item.get("mint_timestamp") else 0
                 n_ohlc = len(item.get("ohlc_1s", [])) if item.get("ohlc_1s") is not None else 0
                 n_snapshots_5m = len(item.get("snapshots_5m", []))
                 n_holders = len(item.get("holder_snapshots_list", []))
                 tqdm.write(
+                    f"  + Cached: {mint_addr} | Class: {class_id} | Q: {q_score:.4f} | "
+                    f"Events: Mint {n_mints}, Trades {n_trades}, Transfers {n_transfers}, Pool Creations {n_pool_creations}, "
+                    f"Liquidity Changes {n_liquidity_changes}, Fee Collections {n_fee_collections}, "
+                    f"Burns {n_burns}, Supply Locks {n_supply_locks}, Migrations {n_migrations} | "
+                    f"Derived: Ohlc 1s {n_ohlc}, Snapshots 5m {n_snapshots_5m}, Holder Snapshots {n_holders}"
                 )
             except Exception as e:

train.py CHANGED Viewed

@@ -339,15 +339,21 @@ def main() -> None:
             else:
                  logger.info("INFO: Weights found but shuffle=False. Ignoring weights (sequential mode).")
-    dataloader = DataLoader(
-        dataset,
         batch_size=batch_size,
         shuffle=shuffle,
         sampler=sampler,
         num_workers=int(args.num_workers),
         pin_memory=bool(args.pin_memory),
-        collate_fn=functools.partial(filtered_collate, collator)
     )
     # --- 3. Model Init ---
     logger.info("Initializing Oracle Model...")
@@ -361,16 +367,16 @@ def main() -> None:
         multi_modal_dim=multi_modal_encoder.embedding_dim,
         event_pad_id=vocab.EVENT_TO_ID["__PAD__"],
         event_type_to_id=vocab.EVENT_TO_ID,
-        model_config_name="Qwen/Qwen3-0.6B",
         quantiles=quantiles,
         horizons_seconds=horizons,
         dtype=init_dtype
     )
-    # Memory Optimization: Delete unused embedding layer from Qwen backbone
     if hasattr(model.model, 'embed_tokens'):
         del model.model.embed_tokens
-        logger.info("Freed unused Qwen embedding layer memory.")
     # --- 4. Optimizer & Scheduler ---
     optimizer = AdamW(model.parameters(), lr=learning_rate)

             else:
                  logger.info("INFO: Weights found but shuffle=False. Ignoring weights (sequential mode).")
+    dl_kwargs = dict(
+        dataset=dataset,
         batch_size=batch_size,
         shuffle=shuffle,
         sampler=sampler,
         num_workers=int(args.num_workers),
         pin_memory=bool(args.pin_memory),
+        collate_fn=functools.partial(filtered_collate, collator),
     )
+    if int(args.num_workers) > 0:
+        # Keeps workers alive across epochs. Otherwise each epoch respawns workers and
+        # re-initializes heavy per-worker state (e.g. SigLIP MultiModalEncoder).
+        dl_kwargs["persistent_workers"] = True
+        dl_kwargs["prefetch_factor"] = 2
+    dataloader = DataLoader(**dl_kwargs)
     # --- 3. Model Init ---
     logger.info("Initializing Oracle Model...")
         multi_modal_dim=multi_modal_encoder.embedding_dim,
         event_pad_id=vocab.EVENT_TO_ID["__PAD__"],
         event_type_to_id=vocab.EVENT_TO_ID,
+        model_config_name="llama3-12l-768d-gqa4-8k-random",
         quantiles=quantiles,
         horizons_seconds=horizons,
         dtype=init_dtype
     )
+    # Memory optimization: embedding layer isn't used when providing inputs_embeds.
     if hasattr(model.model, 'embed_tokens'):
         del model.model.embed_tokens
+        logger.info("Freed unused backbone embedding layer memory.")
     # --- 4. Optimizer & Scheduler ---
     optimizer = AdamW(model.parameters(), lr=learning_rate)