Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

.gitignore +3 -2
data/data_collator.py +30 -12
data/data_fetcher.py +57 -27
data/data_loader.py +202 -79
install.sh +26 -0
log.log +2 -2
models/model.py +76 -0
models/multi_modal_processor.py +5 -2
models/ohlc_embedder.py +3 -3
train.py +35 -27
train.sh +3 -5

.gitignore CHANGED Viewed

@@ -6,9 +6,10 @@ __pycache__/
 runs/
 data/pump_fun
 .env
 data/cache
 .tmp/
-.cache/

 runs/
 data/pump_fun
+data/cache
 .env
 data/cache
 .tmp/
+.cache/
+checkpoints/

data/data_collator.py CHANGED Viewed

@@ -6,11 +6,26 @@ from torch.nn.utils.rnn import pad_sequence
 from typing import List, Dict, Any, Tuple, Optional, Union
 from collections import defaultdict
 from PIL import Image
-from models.multi_modal_processor import MultiModalEncoder
-# Encoders are NO LONGER imported here
-import models.vocabulary as vocab # For IDs, config sizes
-from data.data_loader import EmbeddingPooler # Import for type hinting and instantiation
 NATIVE_MINT = "So11111111111111111111111111111111111111112"
 QUOTE_MINTS = {
@@ -28,19 +43,19 @@ class MemecoinCollator:
     def __init__(self,
                  event_type_to_id: Dict[str, int],
                  device: torch.device,
-                 multi_modal_encoder: MultiModalEncoder,
                  dtype: torch.dtype,
-                 ohlc_seq_len: int = 300,
-                 max_seq_len: Optional[int] = None
                 ):
         self.event_type_to_id = event_type_to_id
         self.pad_token_id = event_type_to_id.get('__PAD__', 0)
-        self.multi_modal_encoder = multi_modal_encoder
         self.entity_pad_idx = 0
         self.device = device
         self.dtype = dtype
-        self.ohlc_seq_len = ohlc_seq_len
         self.max_seq_len = max_seq_len
     def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
@@ -205,12 +220,15 @@ class MemecoinCollator:
         all_items_sorted = batch_wide_pooler.get_all_items()
         texts_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], str)]
         images_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], Image.Image)]
-        text_embeds = self.multi_modal_encoder(texts_to_encode) if texts_to_encode else torch.empty(0)
-        image_embeds = self.multi_modal_encoder(images_to_encode) if images_to_encode else torch.empty(0)
         # Create the final lookup tensor and fill it based on original item type
-        batch_embedding_pool = torch.zeros(len(all_items_sorted), self.multi_modal_encoder.embedding_dim, device=self.device, dtype=self.dtype)
         text_cursor, image_cursor = 0, 0
         for i, item_data in enumerate(all_items_sorted):
             if isinstance(item_data['item'], str):

 from typing import List, Dict, Any, Tuple, Optional, Union
 from collections import defaultdict
 from PIL import Image
+# --- GLOBAL SINGLETON FOR WORKER PROCESSES ---
+_WORKER_ENCODER = None
+def _get_worker_encoder(model_id: str, dtype: torch.dtype, device: torch.device):
+    """
+    Return the cached MultiModalEncoder for this process, creating it on first use.
+
+    The encoder is always constructed with device="cpu" (VRAM optimization when
+    several workers are used), regardless of the `device` argument.
+
+    Args:
+        model_id: Model identifier forwarded to MultiModalEncoder when it is built.
+        dtype: dtype forwarded to MultiModalEncoder when it is built.
+        device: Accepted but not used by this function.
+
+    Returns:
+        The encoder instance stored in the module-level `_WORKER_ENCODER`.
+
+    NOTE(review): the instance is created only while `_WORKER_ENCODER` is None,
+    so a later call with a different `model_id` or `dtype` still receives the
+    encoder built by the first call.
+    """
+    global _WORKER_ENCODER
+    # Build the encoder only once; every later call reuses the stored instance.
+    if _WORKER_ENCODER is None:
+        print(f"[Worker] Initializing MultiModalEncoder (SigLIP) on CPU (VRAM optimization)...")
+        # Local import to avoid top-level dependency issues
+        from models.multi_modal_processor import MultiModalEncoder
+        # Explicitly pass device="cpu"; the caller-supplied `device` is ignored here.
+        _WORKER_ENCODER = MultiModalEncoder(model_id=model_id, dtype=dtype, device="cpu")
+    return _WORKER_ENCODER
+import models.vocabulary as vocab
+from data.data_loader import EmbeddingPooler
 NATIVE_MINT = "So11111111111111111111111111111111111111112"
 QUOTE_MINTS = {
     def __init__(self,
                  event_type_to_id: Dict[str, int],
                  device: torch.device,
                  dtype: torch.dtype,
+                 max_seq_len: Optional[int] = None,
+                 model_id: str = "google/siglip-so400m-patch16-256-i18n"
                 ):
         self.event_type_to_id = event_type_to_id
         self.pad_token_id = event_type_to_id.get('__PAD__', 0)
+        # self.multi_modal_encoder = multi_modal_encoder # DEPRECATED
+        self.model_id = model_id
         self.entity_pad_idx = 0
         self.device = device
         self.dtype = dtype
+        self.ohlc_seq_len = 300 # HARDCODED
         self.max_seq_len = max_seq_len
     def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
         all_items_sorted = batch_wide_pooler.get_all_items()
         texts_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], str)]
         images_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], Image.Image)]
+        # LAZY LOAD ENCODER
+        encoder = _get_worker_encoder(self.model_id, self.dtype, self.device)
+        text_embeds = encoder(texts_to_encode).to(self.device) if texts_to_encode else torch.empty(0)
+        image_embeds = encoder(images_to_encode).to(self.device) if images_to_encode else torch.empty(0)
         # Create the final lookup tensor and fill it based on original item type
+        batch_embedding_pool = torch.zeros(len(all_items_sorted), encoder.embedding_dim, device=self.device, dtype=self.dtype)
         text_cursor, image_cursor = 0, 0
         for i, item_data in enumerate(all_items_sorted):
             if isinstance(item_data['item'], str):

data/data_fetcher.py CHANGED Viewed

@@ -626,9 +626,11 @@ class DataFetcher:
         return token_details
-    def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
         """
-        Fetches trades for a token, using a 3-part H/B/H strategy if the total count exceeds a threshold.
         Returns three lists: early_trades, middle_trades, recent_trades.
         """
         if not token_address:
@@ -636,31 +638,36 @@ class DataFetcher:
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
-        # 1. Get the total count of trades for the token before the cutoff
-        count_query = "SELECT count() FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s"
-        try:
-            total_trades = self.db_client.execute(count_query, params)[0][0]
-            print(f"INFO: Found {total_trades} total trades for token {token_address} before {T_cutoff}.")
-        except Exception as e:
-            print(f"ERROR: Could not count trades for token {token_address}: {e}")
-            return [], [], []
-        # 2. Decide which query to use based on the count
-        if total_trades < count_threshold:
-            print("INFO: Fetching all trades (count is below H/B/H threshold).")
             query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
             try:
                 rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
                 if not rows: return [], [], []
                 columns = [col[0] for col in columns_info]
                 all_trades = [dict(zip(columns, row)) for row in rows]
-                # When not using HBH, all trades are considered "early"
                 return all_trades, [], []
             except Exception as e:
                 print(f"ERROR: Failed to fetch all trades for token {token_address}: {e}")
                 return [], [], []
-        # 3. Use the H/B/H strategy if the count is high
         print("INFO: Fetching trades using 3-part High-Def/Blurry/High-Def strategy.")
         try:
             # Fetch Early (High-Def)
@@ -792,7 +799,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching pool creation events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
@@ -831,7 +838,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'pool_addresses': pool_addresses, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching liquidity change events for {len(pool_addresses)} pool(s).")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
@@ -870,7 +877,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching fee collection events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
@@ -908,7 +915,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching migrations for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
             if not rows:
@@ -946,7 +953,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching burn events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
@@ -987,7 +994,7 @@ class DataFetcher:
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Fetching supply lock events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
@@ -1020,7 +1027,7 @@ class DataFetcher:
         LIMIT %(limit)s;
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff, 'limit': int(limit)}
-        print(f"INFO: Fetching top holders for snapshot for {token_address} (limit {limit}).")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
             if not rows:
@@ -1050,7 +1057,7 @@ class DataFetcher:
         WHERE rn_per_holding = 1 AND current_balance > 0;
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
-        print(f"INFO: Counting total holders for {token_address} at cutoff.")
         try:
             rows = self.db_client.execute(query, params)
             if not rows:
@@ -1067,12 +1074,20 @@ class DataFetcher:
         max_horizon_seconds: int = 3600,
         include_wallet_data: bool = True,
         include_graph: bool = True,
-        min_trades: int = 0
     ) -> Optional[Dict[str, Any]]:
         """
         Fetches ALL available data for a token up to the maximum horizon.
         This data is agnostic of T_cutoff and will be masked/filtered dynamically during training.
         Wallet/graph data can be skipped to avoid caching T_cutoff-dependent features.
         """
         # 1. Calculate the absolute maximum timestamp we care about (mint + max_horizon)
@@ -1086,8 +1101,9 @@ class DataFetcher:
         # So we pass max_limit_time as the "cutoff" for the purpose of raw data collection.
         # We use a large enough limit to get all relevant trades for the session
         early_trades, middle_trades, recent_trades = self.fetch_trades_for_token(
-            token_address, max_limit_time, 30000, 10000, 15000
         )
         # Combine and deduplicate trades
@@ -1099,12 +1115,26 @@ class DataFetcher:
         sorted_trades = sorted(list(all_trades.values()), key=lambda x: x['timestamp'])
         if len(sorted_trades) < min_trades:
             print(f"  SKIP: Token {token_address} has only {len(sorted_trades)} trades (min required: {min_trades}). skipping fetches.")
             return None
         # 3. Fetch other events
-        transfers = self.fetch_transfers_for_token(token_address, max_limit_time, 0.0) # 0.0 means fetch all
         pool_creations = self.fetch_pool_creations_for_token(token_address, max_limit_time)
         # Collect pool addresses to fetch liquidity changes

         return token_details
+    def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int, full_history: bool = False) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
         """
+        Fetches trades for a token.
+        If full_history is True, fetches ALL trades (ignores H/B/H limits).
+        Otherwise, uses the 3-part H/B/H strategy if the total count exceeds a threshold.
         Returns three lists: early_trades, middle_trades, recent_trades.
         """
         if not token_address:
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
+        # 1. Get the total count if we care about H/B/H logic
+        if not full_history:
+            count_query = "SELECT count() FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s"
+            try:
+                total_trades = self.db_client.execute(count_query, params)[0][0]
+                print(f"INFO: Found {total_trades} total trades for token {token_address} before {T_cutoff}.")
+            except Exception as e:
+                print(f"ERROR: Could not count trades for token {token_address}: {e}")
+                return [], [], []
+        else:
+             total_trades = 0 # Dummy value, ignored
+        # 2. Decide which query to use
+        # If full_history is ON, or count is low, fetch everything.
+        if full_history or total_trades < count_threshold:
+            mode = "Full History" if full_history else "Low Count"
+            # print(f"INFO: Fetching all trades ({mode}).")
             query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
             try:
                 rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
                 if not rows: return [], [], []
                 columns = [col[0] for col in columns_info]
                 all_trades = [dict(zip(columns, row)) for row in rows]
+                # When not using HBH or fetching full history, all trades are considered "early" (or just one big block)
                 return all_trades, [], []
             except Exception as e:
                 print(f"ERROR: Failed to fetch all trades for token {token_address}: {e}")
                 return [], [], []
+        # 3. Use the H/B/H strategy if the count is high AND not full_history
         print("INFO: Fetching trades using 3-part High-Def/Blurry/High-Def strategy.")
         try:
             # Fetch Early (High-Def)
         ORDER BY timestamp ASC
         """
         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching pool creation events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
         ORDER BY timestamp ASC
         """
         params = {'pool_addresses': pool_addresses, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching liquidity change events for {len(pool_addresses)} pool(s).")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching fee collection events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching migrations for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
             if not rows:
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching burn events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
         ORDER BY timestamp ASC
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Fetching supply lock events for {token_address}.")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
         LIMIT %(limit)s;
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff, 'limit': int(limit)}
+        # print(f"INFO: Fetching top holders for snapshot for {token_address} (limit {limit}).")
         try:
             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
             if not rows:
         WHERE rn_per_holding = 1 AND current_balance > 0;
         """
         params = {'token': token_address, 'T_cutoff': T_cutoff}
+        # print(f"INFO: Counting total holders for {token_address} at timestamp {T_cutoff}.")
         try:
             rows = self.db_client.execute(query, params)
             if not rows:
         max_horizon_seconds: int = 3600,
         include_wallet_data: bool = True,
         include_graph: bool = True,
+        min_trades: int = 0,
+        full_history: bool = False,
+        prune_failed: bool = False,
+        prune_transfers: bool = False
     ) -> Optional[Dict[str, Any]]:
         """
         Fetches ALL available data for a token up to the maximum horizon.
         This data is agnostic of T_cutoff and will be masked/filtered dynamically during training.
         Wallet/graph data can be skipped to avoid caching T_cutoff-dependent features.
+        Args:
+            full_history: If True, fetches ALL trades ignoring H/B/H limits.
+            prune_failed: If True, filters out failed trades from the result.
+            prune_transfers: If True, skips fetching transfers entirely.
         """
         # 1. Calculate the absolute maximum timestamp we care about (mint + max_horizon)
         # So we pass max_limit_time as the "cutoff" for the purpose of raw data collection.
         # We use a large enough limit to get all relevant trades for the session
+        # If full_history is True, these limits are ignored inside the method.
         early_trades, middle_trades, recent_trades = self.fetch_trades_for_token(
+            token_address, max_limit_time, 30000, 10000, 15000, full_history=full_history
         )
         # Combine and deduplicate trades
         sorted_trades = sorted(list(all_trades.values()), key=lambda x: x['timestamp'])
+        # --- PRUNING FAILED TRADES ---
+        if prune_failed:
+            original_count = len(sorted_trades)
+            sorted_trades = [t for t in sorted_trades if t.get('success', False)]
+            if len(sorted_trades) < original_count:
+                # print(f"  INFO: Pruned {original_count - len(sorted_trades)} failed trades.")
+                pass
         if len(sorted_trades) < min_trades:
             print(f"  SKIP: Token {token_address} has only {len(sorted_trades)} trades (min required: {min_trades}). skipping fetches.")
             return None
         # 3. Fetch other events
+        # --- PRUNING TRANSFERS ---
+        if prune_transfers:
+            transfers = []
+            # print("  INFO: Pruning transfers (skipping fetch).")
+        else:
+            transfers = self.fetch_transfers_for_token(token_address, max_limit_time, 0.0) # 0.0 means fetch all
         pool_creations = self.fetch_pool_creations_for_token(token_address, max_limit_time)
         # Collect pool addresses to fetch liquidity changes

data/data_loader.py CHANGED Viewed

@@ -97,11 +97,11 @@ class OracleDataset(Dataset):
     input sequence for the model.
     """
     def __init__(self,
-                 data_fetcher: DataFetcher, # NEW: Pass the fetcher instance
                  horizons_seconds: List[int] = [],
                  quantiles: List[float] = [],
                  max_samples: Optional[int] = None,
-                 ohlc_stats_path: Union[str, Path] = "./data/ohlc_stats.npz", # NEW: Add stats path parameter
                  token_allowlist: Optional[List[str]] = None,
                  t_cutoff_seconds: int = 60,
                  cache_dir: Optional[Union[str, Path]] = None,
@@ -273,7 +273,8 @@ class OracleDataset(Dataset):
         aggregation_trades: List[Dict[str, Any]],
         wallet_data: Dict[str, Any],
         total_supply_dec: float,
-        _register_event_fn
     ) -> None:
         # Prepare helper sets and maps (static sniper set based on earliest buyers)
         all_buy_trades = sorted([e for e in trade_events if e.get('trade_direction') == 0 and e.get('success', False)], key=lambda x: x['timestamp'])
@@ -304,14 +305,25 @@ class OracleDataset(Dataset):
         buyers_seen_global = set()
         prev_holders_count = 0
-        for ts_value in oc_snapshot_times:
             window_start = ts_value - interval_sec
             trades_win = [e for e in trade_events if e.get('success', False) and window_start < e['timestamp'] <= ts_value]
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
             # Per-snapshot holder distribution at ts_value
-            cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
-            holder_records_ts = self.fetcher.fetch_token_holders_for_snapshot(token_address, cutoff_dt_ts, limit=HOLDER_SNAPSHOT_TOP_K)
             holder_entries_ts = []
             for rec in holder_records_ts:
                 addr = rec.get('wallet_address')
@@ -363,8 +375,7 @@ class OracleDataset(Dataset):
                 buyers_seen_global.add(wa)
             # Compute growth against previous snapshot endpoint.
-            end_dt = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
-            holders_end = self.fetcher.fetch_total_holders_count_for_token(token_address, end_dt)
             total_holders = float(holders_end)
             delta_holders = holders_end - prev_holders_count
             holder_growth_rate = float(delta_holders)
@@ -415,7 +426,7 @@ class OracleDataset(Dataset):
         # Fetch all token details in ONE batch query
         all_deployed_token_details = {}
-        if all_deployed_tokens:
             all_deployed_token_details = self.fetcher.fetch_deployed_token_details(list(all_deployed_tokens), T_cutoff)
         for addr, profile in profiles.items():
@@ -454,18 +465,24 @@ class OracleDataset(Dataset):
             profile['deployed_tokens_avg_peak_mc_usd'] = torch.mean(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
             profile['deployed_tokens_median_peak_mc_usd'] = torch.median(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
-    def _process_wallet_data(self, wallet_addresses: List[str], token_data: Dict[str, Any], pooler: EmbeddingPooler, T_cutoff: datetime.datetime) -> tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
         """
-        Fetches and processes profile, social, and holdings data for a list of wallets.
-        Uses a T_cutoff to ensure data is point-in-time accurate.
         """
         if not wallet_addresses:
             return {}, token_data
-        print(f"INFO: Processing wallet data for {len(wallet_addresses)} unique wallets...")
-        # Bulk fetch all data
-        profiles, socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
-        holdings = self.fetcher.fetch_wallet_holdings(wallet_addresses, T_cutoff)
         valid_wallets = [addr for addr in wallet_addresses if addr in profiles]
         dropped_wallets = set(wallet_addresses) - set(valid_wallets)
@@ -618,8 +635,11 @@ class OracleDataset(Dataset):
             return {}
         if token_data is None:
-            print(f"INFO: Processing token data for {len(token_addresses)} unique tokens...")
-            token_data = self.fetcher.fetch_token_data(token_addresses, T_cutoff)
         # --- NEW: Print the raw fetched token data as requested ---
         print("\n--- RAW TOKEN DATA FROM DATABASE ---")
@@ -793,14 +813,13 @@ class OracleDataset(Dataset):
             try:
                 raw_data = torch.load(filepath, map_location='cpu', weights_only=False)
             except Exception as e:
-                print(f"ERROR: Could not load cached item {filepath}: {e}")
-                return None
         else:
              # Online mode fallback
              raw_data = self.__cacheitem__(idx)
         if not raw_data:
-            return None
         required_keys = [
             "mint_timestamp",
@@ -822,8 +841,8 @@ class OracleDataset(Dataset):
                 f"Cached sample missing raw fields ({missing_keys}). Rebuild cache with raw caching enabled."
             )
-        if not self.fetcher:
-            raise RuntimeError("Data fetcher required for T_cutoff-dependent data.")
         def _timestamp_to_order_value(ts_value: Any) -> float:
             if isinstance(ts_value, datetime.datetime):
@@ -904,34 +923,53 @@ class OracleDataset(Dataset):
             if _timestamp_to_order_value(liq.get('timestamp')) <= cutoff_ts:
                 _add_wallet(liq.get('lp_provider'), wallets_to_fetch)
-        holder_records = self.fetcher.fetch_token_holders_for_snapshot(
-            token_address,
-            T_cutoff,
-            limit=HOLDER_SNAPSHOT_TOP_K
-        )
         for holder in holder_records:
             _add_wallet(holder.get('wallet_address'), wallets_to_fetch)
         pooler = EmbeddingPooler()
-        main_token_data = self._process_token_data([token_address], pooler, T_cutoff)
         if not main_token_data:
             return None
         wallet_data, all_token_data = self._process_wallet_data(
             list(wallets_to_fetch),
             main_token_data.copy(),
             pooler,
-            T_cutoff
         )
         graph_entities = {}
         graph_links = {}
-        if wallets_to_fetch:
-            graph_entities, graph_links = self.fetcher.fetch_graph_links(
-                list(wallets_to_fetch),
-                T_cutoff,
-                max_degrees=1
-            )
         # Generate the item
         return self._generate_dataset_item(
@@ -960,13 +998,14 @@ class OracleDataset(Dataset):
              graph_seed_entities=wallets_to_fetch,
              all_graph_entities=graph_entities,
              future_trades_for_labels=raw_data['trades'], # We utilize full trade history for labels!
-             pooler=pooler
         )
     def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
         """
         Fetches cutoff-agnostic raw token data for caching/online sampling.
-        Random T_cutoff sampling happens later in __getitem__.
         """
         if not self.sampled_mints:
@@ -984,6 +1023,7 @@ class OracleDataset(Dataset):
         if not self.fetcher:
             raise RuntimeError("Dataset has no data fetcher; cannot load raw data.")
         raw_data = self.fetcher.fetch_raw_token_data(
             token_address=token_address,
             creator_address=creator_address,
@@ -991,7 +1031,10 @@ class OracleDataset(Dataset):
             max_horizon_seconds=self.max_cache_horizon_seconds,
             include_wallet_data=False,
             include_graph=False,
-            min_trades=50
         )
         if raw_data is None:
             return None
@@ -1005,56 +1048,134 @@ class OracleDataset(Dataset):
                 return float(ts_value)
             except (TypeError, ValueError):
                 return 0.0
-        trade_ts_values = [
-            _timestamp_to_order_value(trade.get('timestamp'))
-            for trade in raw_data.get('trades', [])
-            if trade.get('timestamp') is not None
-        ]
         if not trade_ts_values:
             print(f"  SKIP: No valid trades found for {token_address}.")
             return None
-        horizons = sorted(self.horizons_seconds)
-        first_horizon = horizons[0] if horizons else 60
-        min_label = max(60, first_horizon)
-        min_window = 30
-        # 2. Strict Duration Check
-        # We enforce the exact same logic as __getitem__ to ensure the sample is usable.
-        # Logic:
-        #   lower_bound = max(min_window, first_trade - mint)
-        #   upper_bound = (last_trade - mint) - required_horizon
-        #   We need upper_bound >= lower_bound.
-        last_trade_ts_val = max(trade_ts_values)
-        first_trade_ts_val = min(trade_ts_values)
-        t0_val = _timestamp_to_order_value(t0)
-        # Calculate offsets relative to mint
-        start_offset = max(0.0, first_trade_ts_val - t0_val)
-        end_offset = max(0.0, last_trade_ts_val - t0_val)
-        lower_bound = max(min_window, int(start_offset))
-        # We use the FIRST horizon to determine minimum validity,
-        # but technically we'd prefer to satisfy at least one horizon.
-        # Using min_label (which is max(60, first_horizon)) is safe.
-        required_horizon = min_label
-        upper_bound = end_offset - required_horizon
-        if upper_bound < lower_bound:
-            # Diagnose the failure reason for the log
-            reason = []
-            if end_offset < (min_window + required_horizon):
-                reason.append(f"total duration {end_offset:.1f}s < {(min_window + required_horizon)}s")
-            if (last_trade_ts_val - first_trade_ts_val) < required_horizon:
-                reason.append(f"trade span {(last_trade_ts_val - first_trade_ts_val):.1f}s < {required_horizon}s")
-            reason_str = ", ".join(reason) or "insufficient window overlap"
-            print(f"  SKIP: {token_address} does not fit sampling window. ({reason_str}) (Trades: {len(trade_ts_values)})")
-            return None
         raw_data["protocol_id"] = initial_mint_record.get("protocol")
         return raw_data
@@ -1078,7 +1199,8 @@ class OracleDataset(Dataset):
             graph_seed_entities: set,
             all_graph_entities: Dict[str, str],
             future_trades_for_labels: List[Dict[str, Any]],
-            pooler: EmbeddingPooler
         ) -> Optional[Dict[str, Any]]:
         """
         Processes raw token data into a structured dataset item for a specific T_cutoff.
@@ -1305,7 +1427,8 @@ class OracleDataset(Dataset):
             aggregation_trades,
             wallet_data,
             total_supply_dec,
-            _register_event
         )
         # 7. Finalize Sequence

     input sequence for the model.
     """
     def __init__(self,
+                 data_fetcher: Optional[DataFetcher] = None, # OPTIONAL: Only needed for caching (Writer)
                  horizons_seconds: List[int] = [],
                  quantiles: List[float] = [],
                  max_samples: Optional[int] = None,
+                 ohlc_stats_path: Union[str, Path] = "./data/ohlc_stats.npz",
                  token_allowlist: Optional[List[str]] = None,
                  t_cutoff_seconds: int = 60,
                  cache_dir: Optional[Union[str, Path]] = None,
         aggregation_trades: List[Dict[str, Any]],
         wallet_data: Dict[str, Any],
         total_supply_dec: float,
+        _register_event_fn,
+        cached_holders_list: List[List[str]] = None
     ) -> None:
         # Prepare helper sets and maps (static sniper set based on earliest buyers)
         all_buy_trades = sorted([e for e in trade_events if e.get('trade_direction') == 0 and e.get('success', False)], key=lambda x: x['timestamp'])
         buyers_seen_global = set()
         prev_holders_count = 0
+        for i, ts_value in enumerate(oc_snapshot_times):
             window_start = ts_value - interval_sec
             trades_win = [e for e in trade_events if e.get('success', False) and window_start < e['timestamp'] <= ts_value]
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
             # Per-snapshot holder distribution at ts_value
+            holder_records_ts = []
+            holders_end = 0
+            if cached_holders_list is not None and i < len(cached_holders_list):
+                 # Use cached list of addresses
+                 holder_records_ts = [{'wallet_address': addr, 'current_balance': 0} for addr in cached_holders_list[i]]
+                 holders_end = len(cached_holders_list[i])
+            elif self.fetcher:
+                cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
+                holder_records_ts = self.fetcher.fetch_token_holders_for_snapshot(token_address, cutoff_dt_ts, limit=HOLDER_SNAPSHOT_TOP_K)
+                holders_end = self.fetcher.fetch_total_holders_count_for_token(token_address, cutoff_dt_ts)
+            else:
+                 holder_records_ts = []
+                 holders_end = 0
             holder_entries_ts = []
             for rec in holder_records_ts:
                 addr = rec.get('wallet_address')
                 buyers_seen_global.add(wa)
             # Compute growth against previous snapshot endpoint.
+            # total_holders = float(holders_end) # already handled above
             total_holders = float(holders_end)
             delta_holders = holders_end - prev_holders_count
             holder_growth_rate = float(delta_holders)
         # Fetch all token details in ONE batch query
         all_deployed_token_details = {}
+        if all_deployed_tokens and self.fetcher:
             all_deployed_token_details = self.fetcher.fetch_deployed_token_details(list(all_deployed_tokens), T_cutoff)
         for addr, profile in profiles.items():
             profile['deployed_tokens_avg_peak_mc_usd'] = torch.mean(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
             profile['deployed_tokens_median_peak_mc_usd'] = torch.median(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
+    def _process_wallet_data(self, wallet_addresses: List[str], token_data: Dict[str, Any], pooler: EmbeddingPooler, T_cutoff: datetime.datetime,
+                             profiles_override: Optional[Dict] = None, socials_override: Optional[Dict] = None, holdings_override: Optional[Dict] = None) -> tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
         """
+        Fetches or uses cached profile, social, and holdings data.
         """
         if not wallet_addresses:
             return {}, token_data
+        if profiles_override is not None and socials_override is not None:
+             profiles, socials = profiles_override, socials_override
+             holdings = holdings_override if holdings_override is not None else {}
+        else:
+             print(f"INFO: Processing wallet data for {len(wallet_addresses)} unique wallets...")
+             if self.fetcher:
+                 profiles, socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
+                 holdings = self.fetcher.fetch_wallet_holdings(wallet_addresses, T_cutoff)
+             else:
+                 profiles, socials, holdings = {}, {}, {}
         valid_wallets = [addr for addr in wallet_addresses if addr in profiles]
         dropped_wallets = set(wallet_addresses) - set(valid_wallets)
             return {}
         if token_data is None:
+            if self.fetcher:
+                print(f"INFO: Processing token data for {len(token_addresses)} unique tokens...")
+                token_data = self.fetcher.fetch_token_data(token_addresses, T_cutoff)
+            else:
+                 token_data = {}
         # --- NEW: Print the raw fetched token data as requested ---
         print("\n--- RAW TOKEN DATA FROM DATABASE ---")
             try:
                 raw_data = torch.load(filepath, map_location='cpu', weights_only=False)
             except Exception as e:
+                raise RuntimeError(f"ERROR: Could not load cached item {filepath}: {e}")
         else:
              # Online mode fallback
              raw_data = self.__cacheitem__(idx)
         if not raw_data:
+            raise RuntimeError(f"No raw data loaded for index {idx}")
         required_keys = [
             "mint_timestamp",
                 f"Cached sample missing raw fields ({missing_keys}). Rebuild cache with raw caching enabled."
             )
+        # if not self.fetcher:
+        #    raise RuntimeError("Data fetcher required for T_cutoff-dependent data.")
         def _timestamp_to_order_value(ts_value: Any) -> float:
             if isinstance(ts_value, datetime.datetime):
             if _timestamp_to_order_value(liq.get('timestamp')) <= cutoff_ts:
                 _add_wallet(liq.get('lp_provider'), wallets_to_fetch)
+        # Offline Holder Lookup using raw_data['holder_snapshots_list']
+        # We need the snapshot corresponding to T_cutoff.
+        # Intervals are every 300s from mint_ts.
+        # idx = (T_cutoff - mint) // 300
+        elapsed = (T_cutoff - mint_timestamp).total_seconds()
+        snap_idx = int(elapsed // 300)
+        holder_records = []
+        cached_holders_list = raw_data.get('holder_snapshots_list', [])
+        if 0 <= snap_idx < len(cached_holders_list):
+            # Format expected by _add_wallet: dict with 'wallet_address'
+            holder_records = [{'wallet_address': addr} for addr in cached_holders_list[snap_idx]]
         for holder in holder_records:
             _add_wallet(holder.get('wallet_address'), wallets_to_fetch)
         pooler = EmbeddingPooler()
+        # Prepare offline token data
+        offline_token_data = {token_address: raw_data} # Assuming raw_data contains token metadata at root
+        main_token_data = self._process_token_data([token_address], pooler, T_cutoff, token_data=offline_token_data)
         if not main_token_data:
             return None
+        # Prepare offline wallet data
+        # raw_data['socials'] structure: {'profiles': {...}, 'socials': {...}} usually.
+        # But wait, cached raw_data['socials'] might be just the dict we need?
+        # Let's handle graceful empty if not found.
+        cached_social_bundle = raw_data.get('socials', {})
+        offline_profiles = cached_social_bundle.get('profiles', {})
+        offline_socials = cached_social_bundle.get('socials', {})
+        offline_holdings = {} # Holdings not cached usually due to size
         wallet_data, all_token_data = self._process_wallet_data(
             list(wallets_to_fetch),
             main_token_data.copy(),
             pooler,
+            T_cutoff,
+            profiles_override=offline_profiles,
+            socials_override=offline_socials,
+            holdings_override=offline_holdings
         )
         graph_entities = {}
         graph_links = {}
+        graph_entities = {}
+        graph_links = {}
+        # if wallets_to_fetch:
+        #     graph_entities, graph_links = self.fetcher.fetch_graph_links(...)
+        # Offline Graph: check if raw_data has graph? Assuming no for now.
         # Generate the item
         return self._generate_dataset_item(
              graph_seed_entities=wallets_to_fetch,
              all_graph_entities=graph_entities,
              future_trades_for_labels=raw_data['trades'], # We utilize full trade history for labels!
+             pooler=pooler,
+             cached_holders_list=raw_data.get('holder_snapshots_list')
         )
     def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
         """
         Fetches cutoff-agnostic raw token data for caching/online sampling.
+        Generates dense time-series (1s OHLC, Snapshots) and prunes raw logs.
         """
         if not self.sampled_mints:
         if not self.fetcher:
             raise RuntimeError("Dataset has no data fetcher; cannot load raw data.")
+        # --- FETCH FULL HISTORY with PRUNING ---
         raw_data = self.fetcher.fetch_raw_token_data(
             token_address=token_address,
             creator_address=creator_address,
             max_horizon_seconds=self.max_cache_horizon_seconds,
             include_wallet_data=False,
             include_graph=False,
+            min_trades=50,
+            full_history=True,      # Bypass H/B/H limits
+            prune_failed=True,      # Drop failed trades
+            prune_transfers=True    # Drop transfers (captured in snapshots)
         )
         if raw_data is None:
             return None
                 return float(ts_value)
             except (TypeError, ValueError):
                 return 0.0
+        trades = raw_data.get('trades', [])
+        trade_ts_values = [_timestamp_to_order_value(t.get('timestamp')) for t in trades]
         if not trade_ts_values:
             print(f"  SKIP: No valid trades found for {token_address}.")
             return None
+        t0_val = _timestamp_to_order_value(t0)
+        last_trade_ts_val = max(trade_ts_values)
+        # --- GENERATE DENSE 1s OHLC ---
+        duration_seconds = int(last_trade_ts_val - t0_val) + 120 # Add buffer
+        ohlc_1s = torch.zeros((duration_seconds, 2), dtype=torch.float32)
+        # Sort trades by time
+        # raw_data trades are already sorted by fetcher, but let's be safe
+        trades.sort(key=lambda x: _timestamp_to_order_value(x['timestamp']))
+        # Fill OHLC
+        # A faster way: group by second
+        # We can use a simple loop update or numpy accumulation.
+        # Given standard density, simple loop is fine for caching.
+        trades_by_sec = defaultdict(list)
+        for t in trades:
+            ts = _timestamp_to_order_value(t['timestamp'])
+            sec_idx = int(ts - t0_val)
+            if 0 <= sec_idx < duration_seconds:
+                trades_by_sec[sec_idx].append(t['price_usd'])
+        last_close = float(trades[0]['price_usd'])
+        for i in range(duration_seconds):
+            if i in trades_by_sec:
+                prices = trades_by_sec[i]
+                op = prices[0]
+                cl = prices[-1]
+                last_close = cl
+            else:
+                op = cl = last_close
+            ohlc_1s[i, 0] = float(op)
+            ohlc_1s[i, 1] = float(cl)
+        raw_data['ohlc_1s'] = ohlc_1s
+        # --- GENERATE ON-CHAIN SNAPSHOTS (5m Interval) ---
+        interval = 300 # 5 minutes
+        num_intervals = (duration_seconds // interval) + 1
+        # Feature columns: [volume, tx_count, buy_count, sell_count, total_holders, top_10_holder_pct]
+        # We start with basic trade stats. Holder stats require DB queries.
+        snapshot_stats = torch.zeros((num_intervals, 6), dtype=torch.float32)
+        print(f"  INFO: Generating {num_intervals} snapshots (Interval: {interval}s)...")
+        cum_volume = 0.0
+        cum_tx = 0
+        cum_buys = 0
+        cum_sells = 0
+        # Pre-group trades into 5m buckets for windowed volume
+        buckets = defaultdict(list)
+        for t in trades:
+            ts = _timestamp_to_order_value(t['timestamp'])
+            bucket_idx = int(ts - t0_val) // interval
+            if bucket_idx >= 0:
+                buckets[bucket_idx].append(t)
+        # To avoid spamming DB, we might query holders less frequently or batch?
+        # For now, query every step. 288 queries for 24h is fine.
+        fetched_holders_cache = {} # Map bucket_idx -> (count, top10_pct)
+        holder_snapshots_list = [] # List of (timestamp, holders_list)
+        for i in range(num_intervals):
+            bucket_trades = buckets[i]
+            # Windowed Stats
+            vol = sum(t.get('total_usd', 0.0) for t in bucket_trades)
+            tx = len(bucket_trades)
+            buys = sum(1 for t in bucket_trades if t.get('trade_direction') == 0 or t.get('trade_type') == 0) # 0=Buy
+            sells = tx - buys
+            # DB Stats: Holders (Point-in-Time)
+            # Time is end of bucket
+            snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
+            # These queries can be slow.
+            count = self.fetcher.fetch_total_holders_count_for_token(token_address, snapshot_ts)
+            # Fetch Top 200 as per constant
+            top_holders = self.fetcher.fetch_token_holders_for_snapshot(token_address, snapshot_ts, limit=HOLDER_SNAPSHOT_TOP_K)
+            total_supply = raw_data.get('total_supply', 0) or 1
+            if raw_data.get('decimals'):
+                total_supply /= (10 ** raw_data['decimals'])
+            top10_bal = sum(h.get('current_balance', 0) for h in top_holders[:10])
+            top10_pct = (top10_bal / total_supply) if total_supply > 0 else 0.0
+            snapshot_stats[i, 0] = float(vol)
+            snapshot_stats[i, 1] = float(tx)
+            snapshot_stats[i, 2] = float(buys)
+            snapshot_stats[i, 3] = float(sells)
+            snapshot_stats[i, 4] = float(count)
+            snapshot_stats[i, 5] = float(top10_pct)
+            # Save the holder identities for the event stream
+            # Make it JSON-serializable-ish (no datetime objects)
+            holder_snapshots_list.append({
+                'timestamp': int(snapshot_ts.timestamp()),
+                'holders': top_holders # [{wallet, balance}, ...]
+            })
+        raw_data['snapshots_5m'] = snapshot_stats
+        raw_data['holder_snapshots_list'] = holder_snapshots_list # Save the list
+        # --- Summary Log ---
+        print(f"  [Cache Summary]")
+        print(f"  - 1s Candles:      {len(ohlc_1s)}")
+        print(f"  - 5m Snapshots:    {len(snapshot_stats)}")
+        print(f"  - Trades (Succ):   {len(trades)}")
+        print(f"  - Pool Events:     {len(raw_data.get('pool_creations', []))}")
+        print(f"  - Liquidity Chgs:  {len(raw_data.get('liquidity_changes', []))}")
+        print(f"  - Burns:           {len(raw_data.get('burns', []))}")
+        print(f"  - Supply Locks:    {len(raw_data.get('supply_locks', []))}")
+        print(f"  - Migrations:      {len(raw_data.get('migrations', []))}")
         raw_data["protocol_id"] = initial_mint_record.get("protocol")
         return raw_data
             graph_seed_entities: set,
             all_graph_entities: Dict[str, str],
             future_trades_for_labels: List[Dict[str, Any]],
+            pooler: EmbeddingPooler,
+            cached_holders_list: List[List[str]] = None
         ) -> Optional[Dict[str, Any]]:
         """
         Processes raw token data into a structured dataset item for a specific T_cutoff.
             aggregation_trades,
             wallet_data,
             total_supply_dec,
+            _register_event,
+            cached_holders_list=cached_holders_list
         )
         # 7. Finalize Sequence

install.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+# Provision a Debian/Ubuntu host: base tooling, Rust toolchain, ClickHouse and Neo4j.
+sudo apt update
+sudo apt install -y curl wget gnupg apt-transport-https ca-certificates dirmngr
+sudo apt update
+sudo apt install -y pkg-config libudev-dev
+# Rust toolchain via rustup, non-interactive (-y)
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+# Quoted so a home directory containing spaces does not split the path
+source "$HOME/.cargo/env"
+# ClickHouse (add repo and install)
+# NOTE(review): apt-key is deprecated on recent Debian/Ubuntu releases - confirm it exists on the target image
+sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 8919F6BD2B48D754
+echo "deb https://packages.clickhouse.com/deb stable main" | sudo tee /etc/apt/sources.list.d/clickhouse.list
+sudo apt update
+sudo apt install -y clickhouse-server clickhouse-client
+# Neo4j (add repo and install)
+sudo wget -O - https://debian.neo4j.com/neotechnology.gpg.key | sudo gpg --dearmor -o /usr/share/keyrings/neo4j.gpg
+# Overwrite rather than append, so re-running the script does not duplicate the source entry
+# (this matches how the ClickHouse list file is written above)
+echo "deb [signed-by=/usr/share/keyrings/neo4j.gpg] https://debian.neo4j.com stable latest" | sudo tee /etc/apt/sources.list.d/neo4j.list
+sudo apt update
+sudo apt install -y neo4j
+# Set the initial Neo4j password, then start Neo4j (Runs on bolt://localhost:7687)
+sudo neo4j-admin dbms set-initial-password neo4j123
+neo4j start
+# NOTE(review): invoked directly, clickhouse-server presumably stays in the foreground - keep it as the last command
+clickhouse-server

log.log CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fb6fc43f8ae6467768fb090cfdda9ef48e68d361874317db93e5eee126539989
-size 143685

 version https://git-lfs.github.com/spec/v1
+oid sha256:2bfaace3cf2aadc0acf9e9714d8df00c44bc545db23c87e7497a7844ba3c98a9
+size 6115919

models/model.py CHANGED Viewed

@@ -5,6 +5,8 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import AutoConfig, AutoModel
 from typing import List, Dict, Any, Optional, Tuple
 # --- NOW, we import all the encoders ---
 from models.helper_encoders import ContextualTimeEncoder
@@ -43,6 +45,9 @@ class Oracle(nn.Module):
         self.multi_modal_dim = multi_modal_dim
         self.quantiles = quantiles
         self.horizons_seconds = horizons_seconds
         self.num_outputs = len(quantiles) * len(horizons_seconds)
@@ -225,6 +230,77 @@ class Oracle(nn.Module):
         self.to(dtype)
         print("Oracle model (full pipeline) initialized.")
     def _normalize_and_project(self,
                                features: torch.Tensor,
                                norm_layer: nn.LayerNorm,

 import torch.nn.functional as F
 from transformers import AutoConfig, AutoModel
 from typing import List, Dict, Any, Optional, Tuple
+import os
+import json
 # --- NOW, we import all the encoders ---
 from models.helper_encoders import ContextualTimeEncoder
         self.multi_modal_dim = multi_modal_dim
+        self.num_event_types = num_event_types
+        self.event_pad_id = event_pad_id
+        self.model_config_name = model_config_name
         self.quantiles = quantiles
         self.horizons_seconds = horizons_seconds
         self.num_outputs = len(quantiles) * len(horizons_seconds)
         self.to(dtype)
         print("Oracle model (full pipeline) initialized.")
+    def save_pretrained(self, save_directory: str):
+        """
+        Write the model to ``save_directory`` in a Hugging Face-style layout.
+
+        Files written:
+          * whatever ``self.model.save_pretrained`` emits for the inner model,
+          * ``pytorch_model.bin`` - the state dict of the whole Oracle module,
+          * ``oracle_config.json`` - the metadata ``from_pretrained`` reads back.
+
+        Args:
+            save_directory: Target directory; created (with parents) if missing.
+        """
+        # exist_ok avoids the check-then-create race of an exists()/makedirs() pair.
+        os.makedirs(save_directory, exist_ok=True)
+        # 1. Save the inner transformer model using its own save_pretrained
+        self.model.save_pretrained(save_directory)
+        # 2. Save the whole Oracle state dict (per the original note: transformer + all custom encoders).
+        # It is stored as 'pytorch_model.bin', the file name from_pretrained loads.
+        # NOTE(review): if the backbone save above also emits 'pytorch_model.bin', this call
+        # overwrites it - confirm the backbone is written under a different file name.
+        torch.save(self.state_dict(), os.path.join(save_directory, "pytorch_model.bin"))
+        # 3. Save Oracle specific metadata for reconstruction
+        oracle_config = {
+            "num_event_types": self.num_event_types,
+            "multi_modal_dim": self.multi_modal_dim,
+            "event_pad_id": self.event_pad_id,
+            "model_config_name": self.model_config_name,
+            "quantiles": self.quantiles,
+            "horizons_seconds": self.horizons_seconds,
+            # Stored as a string (e.g. "torch.bfloat16"); from_pretrained maps it back to a torch dtype.
+            "dtype": str(self.dtype),
+            "event_type_to_id": self.event_type_to_id
+        }
+        with open(os.path.join(save_directory, "oracle_config.json"), "w") as f:
+            json.dump(oracle_config, f, indent=2)
+        print(f"✅ Oracle model saved to {save_directory}")
+    @classmethod
+    def from_pretrained(cls, load_directory: str,
+                        token_encoder, wallet_encoder, graph_updater, ohlc_embedder, time_encoder):
+        """
+        Rebuild an Oracle from a directory written by ``save_pretrained``.
+
+        Reads ``oracle_config.json`` for the constructor arguments, instantiates the
+        model with the caller-supplied sub-encoders, then loads ``pytorch_model.bin``
+        into it.
+
+        Args:
+            load_directory: Directory containing ``oracle_config.json`` and ``pytorch_model.bin``.
+            token_encoder, wallet_encoder, graph_updater, ohlc_embedder, time_encoder:
+                Already-initialized sub-encoders; they are passed straight to the constructor.
+
+        Returns:
+            The reconstructed model with the saved state dict loaded (tensors mapped to CPU on load).
+
+        Raises:
+            FileNotFoundError: If the config or weight file is missing.
+            KeyError: If a required entry is absent from ``oracle_config.json``.
+        """
+        config_path = os.path.join(load_directory, "oracle_config.json")
+        with open(config_path, "r") as f:
+            config = json.load(f)
+        # Resolve the dtype by exact name. Substring matching is wrong here:
+        # "torch.bfloat16" contains "float16", so a bf16 checkpoint used to be rebuilt as fp16.
+        dtype_by_name = {
+            "float32": torch.float32,
+            "float16": torch.float16,
+            "bfloat16": torch.bfloat16,
+        }
+        # The config stores str(dtype), e.g. "torch.bfloat16"; keep only the part after the last dot.
+        dtype_name = str(config["dtype"]).rsplit(".", 1)[-1]
+        # Unrecognized names keep the previous default of bfloat16.
+        dtype = dtype_by_name.get(dtype_name, torch.bfloat16)
+        # Instantiate model
+        model = cls(
+            token_encoder=token_encoder,
+            wallet_encoder=wallet_encoder,
+            graph_updater=graph_updater,
+            ohlc_embedder=ohlc_embedder,
+            time_encoder=time_encoder,
+            num_event_types=config["num_event_types"],
+            multi_modal_dim=config["multi_modal_dim"],
+            event_pad_id=config["event_pad_id"],
+            event_type_to_id=config["event_type_to_id"],
+            model_config_name=config["model_config_name"],
+            quantiles=config["quantiles"],
+            horizons_seconds=config["horizons_seconds"],
+            dtype=dtype
+        )
+        # Load weights
+        weight_path = os.path.join(load_directory, "pytorch_model.bin")
+        state_dict = torch.load(weight_path, map_location="cpu")
+        model.load_state_dict(state_dict)
+        print(f"✅ Oracle model loaded from {load_directory}")
+        return model
     def _normalize_and_project(self,
                                features: torch.Tensor,
                                norm_layer: nn.LayerNorm,

models/multi_modal_processor.py CHANGED Viewed

@@ -21,9 +21,12 @@ class MultiModalEncoder:
     This class is intended for creating embeddings for vector search.
     """
-    def __init__(self, model_id="google/siglip-so400m-patch16-256-i18n", dtype: torch.dtype = torch.bfloat16):
         self.model_id = model_id
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.dtype = dtype

     This class is intended for creating embeddings for vector search.
     """
+    def __init__(self, model_id="google/siglip-so400m-patch16-256-i18n", dtype: torch.dtype = torch.bfloat16, device: str = None):
         self.model_id = model_id
+        if device:
+            self.device = device
+        else:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.dtype = dtype

models/ohlc_embedder.py CHANGED Viewed

@@ -18,7 +18,7 @@ class OHLCEmbedder(nn.Module):
         # --- NEW: Interval vocab size ---
         num_intervals: int,
         input_channels: int = 2, # Open, Close
-        sequence_length: int = 300,
         cnn_channels: List[int] = [16, 32, 64],
         kernel_sizes: List[int] = [3, 3, 3],
         # --- NEW: Interval embedding dim ---
@@ -30,12 +30,12 @@ class OHLCEmbedder(nn.Module):
         assert len(cnn_channels) == len(kernel_sizes), "cnn_channels and kernel_sizes must have the same length"
         self.dtype = dtype
-        self.sequence_length = sequence_length
         self.cnn_layers = nn.ModuleList()
         self.output_dim = output_dim
         in_channels = input_channels
-        current_seq_len = sequence_length
         for i, (out_channels, k_size) in enumerate(zip(cnn_channels, kernel_sizes)):
             conv = nn.Conv1d(

         # --- NEW: Interval vocab size ---
         num_intervals: int,
         input_channels: int = 2, # Open, Close
+        # sequence_length: int = 300, # REMOVED: HARDCODED
         cnn_channels: List[int] = [16, 32, 64],
         kernel_sizes: List[int] = [3, 3, 3],
         # --- NEW: Interval embedding dim ---
         assert len(cnn_channels) == len(kernel_sizes), "cnn_channels and kernel_sizes must have the same length"
         self.dtype = dtype
+        self.sequence_length = 300 # HARDCODED
         self.cnn_layers = nn.ModuleList()
         self.output_dim = output_dim
         in_channels = input_channels
+        current_seq_len = 300
         for i, (out_channels, k_size) in enumerate(zip(cnn_channels, kernel_sizes)):
             conv = nn.Conv1d(

train.py CHANGED Viewed

@@ -4,6 +4,9 @@ import math
 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 # Ensure torch/dill have a writable tmp dir
 _DEFAULT_TMP = Path(os.getenv("TMPDIR_OVERRIDE", "./.tmp"))
@@ -12,6 +15,11 @@ resolved_tmp = str(_DEFAULT_TMP.resolve())
 for key in ("TMPDIR", "TMP", "TEMP"):
     os.environ.setdefault(key, resolved_tmp)
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
@@ -126,7 +134,6 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--checkpoint_dir", type=str, default="checkpoints")
     parser.add_argument("--mixed_precision", type=str, default="bf16")
     parser.add_argument("--max_seq_len", type=int, default=16000)
-    parser.add_argument("--ohlc_seq_len", type=int, default=60)
     parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
     parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
     parser.add_argument("--max_samples", type=int, default=None)
@@ -200,8 +207,8 @@ def main() -> None:
     horizons = args.horizons_seconds
     quantiles = args.quantiles
     max_seq_len = args.max_seq_len
-    ohlc_seq_len = args.ohlc_seq_len
     logger.info(f"Initializing Encoders with dtype={init_dtype}...")
     # Encoders
@@ -212,39 +219,29 @@ def main() -> None:
     graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
     ohlc_embedder = OHLCEmbedder(
         num_intervals=vocab.NUM_OHLC_INTERVALS,
-        sequence_length=ohlc_seq_len,
         dtype=init_dtype
     )
     collator = MemecoinCollator(
         event_type_to_id=vocab.EVENT_TO_ID,
         device=device, # Note: Collator will handle basic moves, Accelerate handles the rest
-        multi_modal_encoder=multi_modal_encoder,
         dtype=init_dtype,
-        ohlc_seq_len=ohlc_seq_len,
         max_seq_len=max_seq_len
     )
-    # DB Connections
-    clickhouse_client = ClickHouseClient(
-        host=args.clickhouse_host,
-        port=int(args.clickhouse_port)
-    )
-    neo4j_auth = ("neo4j", "neo4j123")
-    if args.neo4j_user is not None:
-        neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
-    neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
-    data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
     dataset = OracleDataset(
-        data_fetcher=data_fetcher,
         horizons_seconds=horizons,
         quantiles=quantiles,
         max_samples=args.max_samples,
         ohlc_stats_path=args.ohlc_stats_path,
-        t_cutoff_seconds=int(args.t_cutoff_seconds),
         cache_dir="/workspace/apollo/data/cache"
     )
@@ -257,7 +254,7 @@ def main() -> None:
         shuffle=bool(args.shuffle),
         num_workers=int(args.num_workers),
         pin_memory=bool(args.pin_memory),
-        collate_fn=lambda batch: filtered_collate(collator, batch)
     )
     # --- 3. Model Init ---
@@ -442,25 +439,36 @@ def main() -> None:
                     if accelerator.is_main_process:
                         save_path = checkpoint_dir / f"checkpoint-{total_steps}"
                         accelerator.save_state(output_dir=str(save_path))
-                        logger.info(f"Saved checkpoint to {save_path}")
         # End of Epoch Handling
         if valid_batches > 0:
             avg_loss = epoch_loss / valid_batches
             if accelerator.is_main_process:
                 logger.info(f"Epoch {epoch+1} complete. Avg loss: {avg_loss:.6f}")
-                accelerator.log({"train/loss_epoch": avg_loss}, step=global_step)
-                # Save Checkpoint at end of epoch
                 save_path = checkpoint_dir / f"epoch_{epoch+1}"
-                accelerator.save_state(output_dir=str(save_path))
-                logger.info(f"Saved checkpoint to {save_path}")
         else:
             if accelerator.is_main_process:
                 logger.warning(f"Epoch {epoch+1}: No valid batches processed.")
     accelerator.end_training()
-    neo4j_driver.close()
 if __name__ == "__main__":
     main()

 import logging
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
+import functools
+import torch.multiprocessing as mp
 # Ensure torch/dill have a writable tmp dir
 _DEFAULT_TMP = Path(os.getenv("TMPDIR_OVERRIDE", "./.tmp"))
 for key in ("TMPDIR", "TMP", "TEMP"):
     os.environ.setdefault(key, resolved_tmp)
+try:
+    mp.set_start_method('spawn', force=True)
+except RuntimeError:
+    pass
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
     parser.add_argument("--checkpoint_dir", type=str, default="checkpoints")
     parser.add_argument("--mixed_precision", type=str, default="bf16")
     parser.add_argument("--max_seq_len", type=int, default=16000)
     parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
     parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
     parser.add_argument("--max_samples", type=int, default=None)
     horizons = args.horizons_seconds
     quantiles = args.quantiles
     max_seq_len = args.max_seq_len
+    max_seq_len = args.max_seq_len
     logger.info(f"Initializing Encoders with dtype={init_dtype}...")
     # Encoders
     graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
     ohlc_embedder = OHLCEmbedder(
         num_intervals=vocab.NUM_OHLC_INTERVALS,
         dtype=init_dtype
     )
     collator = MemecoinCollator(
         event_type_to_id=vocab.EVENT_TO_ID,
         device=device, # Note: Collator will handle basic moves, Accelerate handles the rest
+        # multi_modal_encoder=multi_modal_encoder, # REMOVED: Uses lazy loading internally
         dtype=init_dtype,
         max_seq_len=max_seq_len
     )
+    # DB Connections - REMOVED for Training (Using Cache)
+    # clickhouse_client = ClickHouseClient(...)
+    # neo4j_driver = GraphDatabase.driver(...)
+    # data_fetcher = DataFetcher(...)
     dataset = OracleDataset(
+        data_fetcher=None, # Training Mode (Reader Only)
         horizons_seconds=horizons,
         quantiles=quantiles,
         max_samples=args.max_samples,
         ohlc_stats_path=args.ohlc_stats_path,
+        t_cutoff_seconds=int(args.t_cutoff_seconds) if hasattr(args, 't_cutoff_seconds') else 60,
         cache_dir="/workspace/apollo/data/cache"
     )
         shuffle=bool(args.shuffle),
         num_workers=int(args.num_workers),
         pin_memory=bool(args.pin_memory),
+        collate_fn=functools.partial(filtered_collate, collator)
     )
     # --- 3. Model Init ---
                     if accelerator.is_main_process:
                         save_path = checkpoint_dir / f"checkpoint-{total_steps}"
                         accelerator.save_state(output_dir=str(save_path))
+                        # NEW: Save in standard HF-loadable way
+                        hf_save_path = save_path / "hf_model"
+                        unwrapped_model = accelerator.unwrap_model(model)
+                        unwrapped_model.save_pretrained(str(hf_save_path))
+                        logger.info(f"Saved checkpoint and HF-style model to {save_path}")
         # End of Epoch Handling
         if valid_batches > 0:
             avg_loss = epoch_loss / valid_batches
             if accelerator.is_main_process:
                 logger.info(f"Epoch {epoch+1} complete. Avg loss: {avg_loss:.6f}")
+                accelerator.log({"train/loss_epoch": avg_loss}, step=total_steps)
+                # Save Checkpoint at end of epoch (REMOVED: saving every epoch is too much)
                 save_path = checkpoint_dir / f"epoch_{epoch+1}"
+                # accelerator.save_state(output_dir=str(save_path))
+                # hf_save_path = save_path / "hf_model"
+                # unwrapped_model = accelerator.unwrap_model(model)
+                # unwrapped_model.save_pretrained(str(hf_save_path))
+                # logger.info(f"Saved and HF-style model (EOF) to {save_path}")
+                pass
         else:
             if accelerator.is_main_process:
                 logger.warning(f"Epoch {epoch+1}: No valid batches processed.")
     accelerator.end_training()
+    # neo4j_driver.close() # REMOVED
 if __name__ == "__main__":
     main()

train.sh CHANGED Viewed

@@ -1,4 +1,4 @@
-accelerate launch train.py \
   --epochs 10 \
   --batch_size 1 \
   --learning_rate 1e-4 \
@@ -7,16 +7,14 @@ accelerate launch train.py \
   --max_grad_norm 1.0 \
   --seed 42 \
   --log_every 1 \
-  --save_every 1000 \
   --tensorboard_dir runs/oracle \
   --checkpoint_dir checkpoints \
   --mixed_precision bf16 \
-  --max_seq_len 50 \
-  --ohlc_seq_len 300 \
   --horizons_seconds 30 60 120 240 420 \
   --quantiles 0.1 0.5 0.9 \
   --ohlc_stats_path ./data/ohlc_stats.npz \
-  --t_cutoff_seconds 60 \
   --num_workers 4 \
   --clickhouse_host localhost \
   --clickhouse_port 9000 \

+/venv/main/bin/accelerate launch train.py \
   --epochs 10 \
   --batch_size 1 \
   --learning_rate 1e-4 \
   --max_grad_norm 1.0 \
   --seed 42 \
   --log_every 1 \
+  --save_every 10 \
   --tensorboard_dir runs/oracle \
   --checkpoint_dir checkpoints \
   --mixed_precision bf16 \
+  --max_seq_len 4096 \
   --horizons_seconds 30 60 120 240 420 \
   --quantiles 0.1 0.5 0.9 \
   --ohlc_stats_path ./data/ohlc_stats.npz \
   --num_workers 4 \
   --clickhouse_host localhost \
   --clickhouse_port 9000 \