Upload folder using huggingface_hub
Browse files- data/data_collator.py +3 -19
- data/data_loader.py +23 -62
- data/ohlc_stats.npz +2 -2
- log.log +2 -2
- models/graph_updater.py +5 -0
- models/helper_encoders.py +5 -0
- models/model.py +0 -8
- models/multi_modal_processor.py +11 -27
- models/ohlc_embedder.py +5 -0
- models/token_encoder.py +5 -18
- models/wallet_encoder.py +22 -0
- scripts/cache_dataset.py +86 -0
- train.py +72 -2
data/data_collator.py
CHANGED
|
@@ -282,23 +282,13 @@ class MemecoinCollator:
|
|
| 282 |
wallet_addr_to_batch_idx = {feat.get('profile', {}).get('wallet_address', f'__error_{i}'): i+1 for i, feat in enumerate(wallet_list_data)}
|
| 283 |
token_addr_to_batch_idx = {feat.get('address', f'__error_{i}'): i+1 for i, feat in enumerate(token_list_data)}
|
| 284 |
|
|
|
|
|
|
|
| 285 |
# Collate Static Raw Features (Tokens, Wallets, Graph)
|
| 286 |
token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
|
| 287 |
wallet_encoder_inputs = self._collate_features_for_encoder(wallet_list_data, ['profile'], self.device, "wallet")
|
| 288 |
graph_updater_links = self._collate_graph_links(batch, wallet_addr_to_batch_idx, token_addr_to_batch_idx)
|
| 289 |
|
| 290 |
-
# --- Logging ---
|
| 291 |
-
pool_contents = batch_wide_pooler.get_all_items()
|
| 292 |
-
print(f"\n[DataCollator: Final Embedding Pool] ({len(pool_contents)} items):")
|
| 293 |
-
if pool_contents:
|
| 294 |
-
for item_data in pool_contents:
|
| 295 |
-
sample_item = item_data['item']
|
| 296 |
-
sample_type = "Image" if isinstance(sample_item, Image.Image) else "Text"
|
| 297 |
-
content_preview = str(sample_item)
|
| 298 |
-
if sample_type == "Text" and len(content_preview) > 100:
|
| 299 |
-
content_preview = content_preview[:97] + "..."
|
| 300 |
-
print(f" - Item (Original Idx {item_data['idx']}): Type='{sample_type}', Content='{content_preview}'")
|
| 301 |
-
|
| 302 |
# --- 5. Prepare Sequence Tensors & Collect Dynamic Data (OHLC) ---
|
| 303 |
B = batch_size
|
| 304 |
L = max_len
|
|
@@ -417,13 +407,7 @@ class MemecoinCollator:
|
|
| 417 |
|
| 418 |
# Loop through sequences to populate tensors and collect chart events
|
| 419 |
for i, seq in enumerate(all_event_sequences):
|
| 420 |
-
|
| 421 |
-
if i == 0:
|
| 422 |
-
context_names = [e.get('event_type', 'Unknown') for e in seq]
|
| 423 |
-
print("\n[DataCollator] Context Preview (Event Sequence Names):")
|
| 424 |
-
print(context_names)
|
| 425 |
-
print(f"[DataCollator] Sequence Length: {len(context_names)}\n")
|
| 426 |
-
|
| 427 |
seq_len = len(seq)
|
| 428 |
if seq_len == 0: continue
|
| 429 |
attention_mask[i, :seq_len] = 1
|
|
|
|
| 282 |
wallet_addr_to_batch_idx = {feat.get('profile', {}).get('wallet_address', f'__error_{i}'): i+1 for i, feat in enumerate(wallet_list_data)}
|
| 283 |
token_addr_to_batch_idx = {feat.get('address', f'__error_{i}'): i+1 for i, feat in enumerate(token_list_data)}
|
| 284 |
|
| 285 |
+
# Collate Static Raw Features (Tokens, Wallets, Graph)
|
| 286 |
+
token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
|
| 287 |
# Collate Static Raw Features (Tokens, Wallets, Graph)
|
| 288 |
token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
|
| 289 |
wallet_encoder_inputs = self._collate_features_for_encoder(wallet_list_data, ['profile'], self.device, "wallet")
|
| 290 |
graph_updater_links = self._collate_graph_links(batch, wallet_addr_to_batch_idx, token_addr_to_batch_idx)
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
# --- 5. Prepare Sequence Tensors & Collect Dynamic Data (OHLC) ---
|
| 293 |
B = batch_size
|
| 294 |
L = max_len
|
|
|
|
| 407 |
|
| 408 |
# Loop through sequences to populate tensors and collect chart events
|
| 409 |
for i, seq in enumerate(all_event_sequences):
|
| 410 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
seq_len = len(seq)
|
| 412 |
if seq_len == 0: continue
|
| 413 |
attention_mask[i, :seq_len] = 1
|
data/data_loader.py
CHANGED
|
@@ -273,7 +273,6 @@ class OracleDataset(Dataset):
|
|
| 273 |
|
| 274 |
ts_list = [int(entry[0]) for entry in price_series]
|
| 275 |
price_list = [float(entry[1]) for entry in price_series]
|
| 276 |
-
print(f"[DEBUG-TRACE-LABELS] ts_list len: {len(ts_list)}, price_list len: {len(price_list)}")
|
| 277 |
if not ts_list:
|
| 278 |
return torch.zeros(self.num_outputs), torch.zeros(self.num_outputs), []
|
| 279 |
|
|
@@ -531,7 +530,6 @@ class OracleDataset(Dataset):
|
|
| 531 |
profiles, socials = profiles_override, socials_override
|
| 532 |
holdings = holdings_override if holdings_override is not None else {}
|
| 533 |
else:
|
| 534 |
-
print(f"INFO: Processing wallet data for {len(wallet_addresses)} unique wallets...")
|
| 535 |
if self.fetcher:
|
| 536 |
profiles, socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
|
| 537 |
holdings = self.fetcher.fetch_wallet_holdings(wallet_addresses, T_cutoff)
|
|
@@ -539,40 +537,29 @@ class OracleDataset(Dataset):
|
|
| 539 |
profiles, socials, holdings = {}, {}, {}
|
| 540 |
|
| 541 |
valid_wallets = [addr for addr in wallet_addresses if addr in profiles]
|
| 542 |
-
dropped_wallets = set(wallet_addresses) - set(valid_wallets)
|
| 543 |
-
if dropped_wallets:
|
| 544 |
-
print(f"INFO: Skipping {len(dropped_wallets)} wallets with no profile before cutoff.")
|
| 545 |
if not valid_wallets:
|
| 546 |
-
print("INFO: All wallets were graph-only or appeared after cutoff; skipping wallet processing for this token.")
|
| 547 |
return {}, token_data
|
| 548 |
wallet_addresses = valid_wallets
|
| 549 |
|
| 550 |
-
# ---
|
| 551 |
all_holding_mints = set()
|
| 552 |
for wallet_addr in wallet_addresses:
|
| 553 |
for holding_item in holdings.get(wallet_addr, []):
|
| 554 |
if 'mint_address' in holding_item:
|
| 555 |
all_holding_mints.add(holding_item['mint_address'])
|
| 556 |
|
| 557 |
-
# ---
|
| 558 |
-
# 1. Fetch raw data for all newly found tokens from holdings.
|
| 559 |
-
# 2. Process this raw data to get embedding indices and add to the pooler.
|
| 560 |
-
# Note: _process_token_data is designed to take a list and return a dict.
|
| 561 |
-
# We pass the addresses and let it handle the fetching and processing internally.
|
| 562 |
processed_new_tokens = self._process_token_data(list(all_holding_mints), pooler, T_cutoff)
|
| 563 |
-
# 3. Merge the fully processed new tokens with the existing main token data.
|
| 564 |
all_token_data = {**token_data, **(processed_new_tokens or {})}
|
| 565 |
|
| 566 |
-
# ---
|
| 567 |
self._calculate_deployed_token_stats(profiles, T_cutoff)
|
| 568 |
|
| 569 |
# --- Assemble the final wallet dictionary ---
|
| 570 |
-
# This structure is exactly what the WalletEncoder expects.
|
| 571 |
final_wallets = {}
|
| 572 |
for addr in wallet_addresses:
|
| 573 |
|
| 574 |
# --- Define all expected numerical keys for a profile ---
|
| 575 |
-
# This prevents KeyErrors if the DB returns a partial profile.
|
| 576 |
expected_profile_keys = [
|
| 577 |
'age', 'deployed_tokens_count', 'deployed_tokens_migrated_pct',
|
| 578 |
'deployed_tokens_avg_lifetime_sec', 'deployed_tokens_avg_peak_mc_usd',
|
|
@@ -585,54 +572,39 @@ class OracleDataset(Dataset):
|
|
| 585 |
'stats_1d_total_fee', 'stats_1d_winrate', 'stats_1d_tokens_traded',
|
| 586 |
'stats_7d_realized_profit_sol', 'stats_7d_realized_profit_pnl', 'stats_7d_buy_count', 'stats_7d_sell_count', 'stats_7d_transfer_in_count', 'stats_7d_transfer_out_count', 'stats_7d_avg_holding_period', 'stats_7d_total_bought_cost_sol', 'stats_7d_total_sold_income_sol', 'stats_7d_total_fee', 'stats_7d_winrate', 'stats_7d_tokens_traded'
|
| 587 |
]
|
| 588 |
-
|
| 589 |
-
# --- NEW: If a wallet profile doesn't exist in the DB, skip it entirely. ---
|
| 590 |
-
# This removes the old logic that created a placeholder profile with zeroed-out features.
|
| 591 |
-
# "If it doesn't exist, it doesn't exist."
|
| 592 |
profile_data = profiles.get(addr, None)
|
| 593 |
if not profile_data:
|
| 594 |
-
print(f"INFO: Wallet {addr} found in graph but has no profile in DB. Skipping this wallet.")
|
| 595 |
continue
|
| 596 |
|
| 597 |
-
# --- NEW: Ensure all expected keys exist in the fetched profile ---
|
| 598 |
for key in expected_profile_keys:
|
| 599 |
-
profile_data.setdefault(key, 0.0)
|
| 600 |
|
| 601 |
social_data = socials.get(addr, {})
|
| 602 |
|
| 603 |
-
# ---
|
| 604 |
social_data['has_pf_profile'] = bool(social_data.get('pumpfun_username'))
|
| 605 |
social_data['has_twitter'] = bool(social_data.get('twitter_username'))
|
| 606 |
social_data['has_telegram'] = bool(social_data.get('telegram_channel'))
|
| 607 |
-
# 'is_exchange_wallet' is not in the schema, so we'll default to False for now.
|
| 608 |
-
# This is a feature that would likely come from a 'tags' column or a separate service.
|
| 609 |
social_data['is_exchange_wallet'] = 'exchange_wallet' in profile_data.get('tags', [])
|
| 610 |
|
| 611 |
-
# ---
|
| 612 |
funded_ts = profile_data.get('funded_timestamp', 0)
|
| 613 |
if funded_ts and funded_ts > 0:
|
| 614 |
-
# Calculate age in seconds from the funding timestamp
|
| 615 |
age_seconds = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) - funded_ts
|
| 616 |
else:
|
| 617 |
-
# Fallback for wallets older than our DB window, as requested
|
| 618 |
-
# 5 months * 30 days/month * 24 hours/day * 3600 seconds/hour
|
| 619 |
age_seconds = 12_960_000
|
| 620 |
|
| 621 |
-
# Add the calculated age to the profile data that the WalletEncoder will receive
|
| 622 |
profile_data['age'] = float(age_seconds)
|
| 623 |
|
| 624 |
-
# Get the username and add it to the embedding pooler
|
| 625 |
username = social_data.get('pumpfun_username') or social_data.get('twitter_username') or social_data.get('kolscan_name')
|
| 626 |
|
| 627 |
if isinstance(username, str) and username.strip():
|
| 628 |
social_data['username_emb_idx'] = pooler.get_idx(username.strip())
|
| 629 |
else:
|
| 630 |
-
social_data['username_emb_idx'] = 0
|
| 631 |
|
| 632 |
-
# ---
|
| 633 |
-
# We create a new list `valid_wallet_holdings` to ensure that if a holding's
|
| 634 |
-
# token is invalid (filtered out by _process_token_data), the entire holding
|
| 635 |
-
# row is removed and not passed to the WalletEncoder.
|
| 636 |
original_holdings = holdings.get(addr, [])
|
| 637 |
valid_wallet_holdings = []
|
| 638 |
now_ts = datetime.datetime.now(datetime.timezone.utc)
|
|
@@ -643,7 +615,6 @@ class OracleDataset(Dataset):
|
|
| 643 |
token_info = all_token_data.get(mint_addr)
|
| 644 |
|
| 645 |
if not token_info:
|
| 646 |
-
print(f"INFO: Skipping holding for token {mint_addr} in wallet {addr} because token data is invalid/missing.")
|
| 647 |
continue
|
| 648 |
|
| 649 |
end_ts = holding_item.get('end_holding_at')
|
|
@@ -662,10 +633,9 @@ class OracleDataset(Dataset):
|
|
| 662 |
holding_item['balance_pct_to_supply'] = 0.0
|
| 663 |
|
| 664 |
# 3. --- NEW: Calculate bought_amount_sol_pct_to_native_balance ---
|
| 665 |
-
# This uses the historically accurate native balance from the profile.
|
| 666 |
wallet_native_balance = profile_data.get('balance', 0.0)
|
| 667 |
bought_cost_sol = holding_item.get('history_bought_cost_sol', 0.0)
|
| 668 |
-
if wallet_native_balance > 1e-9:
|
| 669 |
holding_item['bought_amount_sol_pct_to_native_balance'] = bought_cost_sol / wallet_native_balance
|
| 670 |
else:
|
| 671 |
holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
|
|
@@ -695,9 +665,7 @@ class OracleDataset(Dataset):
|
|
| 695 |
else:
|
| 696 |
token_data = {}
|
| 697 |
|
| 698 |
-
|
| 699 |
-
print("\n--- RAW TOKEN DATA FROM DATABASE ---")
|
| 700 |
-
print(token_data)
|
| 701 |
|
| 702 |
# Add pre-computed embedding indices to the token data
|
| 703 |
# --- CRITICAL FIX: This function now returns None if the main token is invalid ---
|
|
@@ -836,14 +804,8 @@ class OracleDataset(Dataset):
|
|
| 836 |
full_ohlc = []
|
| 837 |
start_ts = sorted_intervals[0]
|
| 838 |
end_ts = int(T_cutoff.timestamp())
|
| 839 |
-
# Align end_ts to the interval grid
|
| 840 |
-
end_ts = (end_ts // interval_seconds) * interval_seconds
|
| 841 |
last_price = aggregation_trades[0]['price_usd']
|
| 842 |
|
| 843 |
-
# --- NEW: Debugging log for trades grouped by interval ---
|
| 844 |
-
print(f"\n[DEBUG] OHLC Generation: Trades grouped by interval bucket:")
|
| 845 |
-
print(dict(trades_by_interval))
|
| 846 |
-
|
| 847 |
for ts in range(start_ts, end_ts + 1, interval_seconds):
|
| 848 |
if ts in trades_by_interval:
|
| 849 |
prices = trades_by_interval[ts]
|
|
@@ -940,12 +902,21 @@ class OracleDataset(Dataset):
|
|
| 940 |
# If somehow we have fewer than 25 trades (cache mismatch?), fallback to last.
|
| 941 |
safe_idx = min(24, len(sorted_trades_ts) - 1)
|
| 942 |
min_cutoff_ts = sorted_trades_ts[safe_idx]
|
| 943 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 944 |
|
| 945 |
if max_cutoff_ts <= min_cutoff_ts:
|
| 946 |
sample_offset_ts = min_cutoff_ts
|
| 947 |
else:
|
| 948 |
-
# Standard case: sample uniformly between [Trade[24], LastTrade]
|
| 949 |
sample_offset_ts = random.uniform(min_cutoff_ts, max_cutoff_ts)
|
| 950 |
|
| 951 |
T_cutoff = datetime.datetime.fromtimestamp(sample_offset_ts, tz=datetime.timezone.utc)
|
|
@@ -1221,17 +1192,7 @@ class OracleDataset(Dataset):
|
|
| 1221 |
raw_data['snapshots_5m'] = snapshot_stats
|
| 1222 |
raw_data['holder_snapshots_list'] = holder_snapshots_list # Save the list
|
| 1223 |
|
| 1224 |
-
|
| 1225 |
-
print(f" [Cache Summary]")
|
| 1226 |
-
print(f" - 1s Candles: {len(ohlc_1s)}")
|
| 1227 |
-
print(f" - 5m Snapshots: {len(snapshot_stats)}")
|
| 1228 |
-
print(f" - Trades (Succ): {len(trades)}")
|
| 1229 |
-
print(f" - Pool Events: {len(raw_data.get('pool_creations', []))}")
|
| 1230 |
-
print(f" - Liquidity Chgs: {len(raw_data.get('liquidity_changes', []))}")
|
| 1231 |
-
print(f" - Burns: {len(raw_data.get('burns', []))}")
|
| 1232 |
-
print(f" - Supply Locks: {len(raw_data.get('supply_locks', []))}")
|
| 1233 |
-
print(f" - Migrations: {len(raw_data.get('migrations', []))}")
|
| 1234 |
-
|
| 1235 |
raw_data["protocol_id"] = initial_mint_record.get("protocol")
|
| 1236 |
return raw_data
|
| 1237 |
|
|
|
|
| 273 |
|
| 274 |
ts_list = [int(entry[0]) for entry in price_series]
|
| 275 |
price_list = [float(entry[1]) for entry in price_series]
|
|
|
|
| 276 |
if not ts_list:
|
| 277 |
return torch.zeros(self.num_outputs), torch.zeros(self.num_outputs), []
|
| 278 |
|
|
|
|
| 530 |
profiles, socials = profiles_override, socials_override
|
| 531 |
holdings = holdings_override if holdings_override is not None else {}
|
| 532 |
else:
|
|
|
|
| 533 |
if self.fetcher:
|
| 534 |
profiles, socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
|
| 535 |
holdings = self.fetcher.fetch_wallet_holdings(wallet_addresses, T_cutoff)
|
|
|
|
| 537 |
profiles, socials, holdings = {}, {}, {}
|
| 538 |
|
| 539 |
valid_wallets = [addr for addr in wallet_addresses if addr in profiles]
|
|
|
|
|
|
|
|
|
|
| 540 |
if not valid_wallets:
|
|
|
|
| 541 |
return {}, token_data
|
| 542 |
wallet_addresses = valid_wallets
|
| 543 |
|
| 544 |
+
# --- Collect all unique mints from holdings to fetch their data ---
|
| 545 |
all_holding_mints = set()
|
| 546 |
for wallet_addr in wallet_addresses:
|
| 547 |
for holding_item in holdings.get(wallet_addr, []):
|
| 548 |
if 'mint_address' in holding_item:
|
| 549 |
all_holding_mints.add(holding_item['mint_address'])
|
| 550 |
|
| 551 |
+
# --- Process all discovered tokens with point-in-time logic ---
|
|
|
|
|
|
|
|
|
|
|
|
|
| 552 |
processed_new_tokens = self._process_token_data(list(all_holding_mints), pooler, T_cutoff)
|
|
|
|
| 553 |
all_token_data = {**token_data, **(processed_new_tokens or {})}
|
| 554 |
|
| 555 |
+
# --- Calculate deployed token stats using point-in-time logic ---
|
| 556 |
self._calculate_deployed_token_stats(profiles, T_cutoff)
|
| 557 |
|
| 558 |
# --- Assemble the final wallet dictionary ---
|
|
|
|
| 559 |
final_wallets = {}
|
| 560 |
for addr in wallet_addresses:
|
| 561 |
|
| 562 |
# --- Define all expected numerical keys for a profile ---
|
|
|
|
| 563 |
expected_profile_keys = [
|
| 564 |
'age', 'deployed_tokens_count', 'deployed_tokens_migrated_pct',
|
| 565 |
'deployed_tokens_avg_lifetime_sec', 'deployed_tokens_avg_peak_mc_usd',
|
|
|
|
| 572 |
'stats_1d_total_fee', 'stats_1d_winrate', 'stats_1d_tokens_traded',
|
| 573 |
'stats_7d_realized_profit_sol', 'stats_7d_realized_profit_pnl', 'stats_7d_buy_count', 'stats_7d_sell_count', 'stats_7d_transfer_in_count', 'stats_7d_transfer_out_count', 'stats_7d_avg_holding_period', 'stats_7d_total_bought_cost_sol', 'stats_7d_total_sold_income_sol', 'stats_7d_total_fee', 'stats_7d_winrate', 'stats_7d_tokens_traded'
|
| 574 |
]
|
| 575 |
+
|
|
|
|
|
|
|
|
|
|
| 576 |
profile_data = profiles.get(addr, None)
|
| 577 |
if not profile_data:
|
|
|
|
| 578 |
continue
|
| 579 |
|
|
|
|
| 580 |
for key in expected_profile_keys:
|
| 581 |
+
profile_data.setdefault(key, 0.0)
|
| 582 |
|
| 583 |
social_data = socials.get(addr, {})
|
| 584 |
|
| 585 |
+
# --- Derive boolean social flags based on schema ---
|
| 586 |
social_data['has_pf_profile'] = bool(social_data.get('pumpfun_username'))
|
| 587 |
social_data['has_twitter'] = bool(social_data.get('twitter_username'))
|
| 588 |
social_data['has_telegram'] = bool(social_data.get('telegram_channel'))
|
|
|
|
|
|
|
| 589 |
social_data['is_exchange_wallet'] = 'exchange_wallet' in profile_data.get('tags', [])
|
| 590 |
|
| 591 |
+
# --- Calculate 'age' based on user's logic ---
|
| 592 |
funded_ts = profile_data.get('funded_timestamp', 0)
|
| 593 |
if funded_ts and funded_ts > 0:
|
|
|
|
| 594 |
age_seconds = int(datetime.datetime.now(datetime.timezone.utc).timestamp()) - funded_ts
|
| 595 |
else:
|
|
|
|
|
|
|
| 596 |
age_seconds = 12_960_000
|
| 597 |
|
|
|
|
| 598 |
profile_data['age'] = float(age_seconds)
|
| 599 |
|
|
|
|
| 600 |
username = social_data.get('pumpfun_username') or social_data.get('twitter_username') or social_data.get('kolscan_name')
|
| 601 |
|
| 602 |
if isinstance(username, str) and username.strip():
|
| 603 |
social_data['username_emb_idx'] = pooler.get_idx(username.strip())
|
| 604 |
else:
|
| 605 |
+
social_data['username_emb_idx'] = 0
|
| 606 |
|
| 607 |
+
# --- Filter holdings and calculate derived features ---
|
|
|
|
|
|
|
|
|
|
| 608 |
original_holdings = holdings.get(addr, [])
|
| 609 |
valid_wallet_holdings = []
|
| 610 |
now_ts = datetime.datetime.now(datetime.timezone.utc)
|
|
|
|
| 615 |
token_info = all_token_data.get(mint_addr)
|
| 616 |
|
| 617 |
if not token_info:
|
|
|
|
| 618 |
continue
|
| 619 |
|
| 620 |
end_ts = holding_item.get('end_holding_at')
|
|
|
|
| 633 |
holding_item['balance_pct_to_supply'] = 0.0
|
| 634 |
|
| 635 |
# 3. --- NEW: Calculate bought_amount_sol_pct_to_native_balance ---
|
|
|
|
| 636 |
wallet_native_balance = profile_data.get('balance', 0.0)
|
| 637 |
bought_cost_sol = holding_item.get('history_bought_cost_sol', 0.0)
|
| 638 |
+
if wallet_native_balance > 1e-9:
|
| 639 |
holding_item['bought_amount_sol_pct_to_native_balance'] = bought_cost_sol / wallet_native_balance
|
| 640 |
else:
|
| 641 |
holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
|
|
|
|
| 665 |
else:
|
| 666 |
token_data = {}
|
| 667 |
|
| 668 |
+
|
|
|
|
|
|
|
| 669 |
|
| 670 |
# Add pre-computed embedding indices to the token data
|
| 671 |
# --- CRITICAL FIX: This function now returns None if the main token is invalid ---
|
|
|
|
| 804 |
full_ohlc = []
|
| 805 |
start_ts = sorted_intervals[0]
|
| 806 |
end_ts = int(T_cutoff.timestamp())
|
|
|
|
|
|
|
| 807 |
last_price = aggregation_trades[0]['price_usd']
|
| 808 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
for ts in range(start_ts, end_ts + 1, interval_seconds):
|
| 810 |
if ts in trades_by_interval:
|
| 811 |
prices = trades_by_interval[ts]
|
|
|
|
| 902 |
# If somehow we have fewer than 25 trades (cache mismatch?), fallback to last.
|
| 903 |
safe_idx = min(24, len(sorted_trades_ts) - 1)
|
| 904 |
min_cutoff_ts = sorted_trades_ts[safe_idx]
|
| 905 |
+
|
| 906 |
+
# --- FIX: Ensure max_cutoff leaves room for the largest horizon ---
|
| 907 |
+
# Otherwise, if T_cutoff is near the end, all horizons are masked as 0.
|
| 908 |
+
max_horizon = max(horizons) if horizons else 600
|
| 909 |
+
max_cutoff_ts = sorted_trades_ts[-1] - max_horizon
|
| 910 |
+
|
| 911 |
+
# Safety: Ensure max_cutoff_ts >= min_cutoff_ts
|
| 912 |
+
if max_cutoff_ts < min_cutoff_ts:
|
| 913 |
+
# Token duration is too short for the horizons, use earliest valid cutoff
|
| 914 |
+
max_cutoff_ts = min_cutoff_ts
|
| 915 |
|
| 916 |
if max_cutoff_ts <= min_cutoff_ts:
|
| 917 |
sample_offset_ts = min_cutoff_ts
|
| 918 |
else:
|
| 919 |
+
# Standard case: sample uniformly between [Trade[24], LastTrade - max_horizon]
|
| 920 |
sample_offset_ts = random.uniform(min_cutoff_ts, max_cutoff_ts)
|
| 921 |
|
| 922 |
T_cutoff = datetime.datetime.fromtimestamp(sample_offset_ts, tz=datetime.timezone.utc)
|
|
|
|
| 1192 |
raw_data['snapshots_5m'] = snapshot_stats
|
| 1193 |
raw_data['holder_snapshots_list'] = holder_snapshots_list # Save the list
|
| 1194 |
|
| 1195 |
+
raw_data['holder_snapshots_list'] = holder_snapshots_list # Save the list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1196 |
raw_data["protocol_id"] = initial_mint_record.get("protocol")
|
| 1197 |
return raw_data
|
| 1198 |
|
data/ohlc_stats.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:847193fc90f4b0313f515ea38a24fd073be09188cfc4764c5dce3f658d4dc117
|
| 3 |
+
size 1660
|
log.log
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:10917f8ad8d8962a8c05a46f2b24dcb1180b23665d0767ea5c65c63d9ec09c92
|
| 3 |
+
size 314966
|
models/graph_updater.py
CHANGED
|
@@ -352,6 +352,11 @@ class GraphUpdater(nn.Module):
|
|
| 352 |
self.norm = nn.LayerNorm(node_dim)
|
| 353 |
self.to(dtype) # Move norm layer and ModuleList container
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
def _build_edge_groups(self) -> Dict[tuple, List[str]]:
|
| 356 |
"""Group relations by (src_type, dst_type) so conv weights can be shared."""
|
| 357 |
groups: Dict[tuple, List[str]] = defaultdict(list)
|
|
|
|
| 352 |
self.norm = nn.LayerNorm(node_dim)
|
| 353 |
self.to(dtype) # Move norm layer and ModuleList container
|
| 354 |
|
| 355 |
+
# Log params
|
| 356 |
+
total_params = sum(p.numel() for p in self.parameters())
|
| 357 |
+
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 358 |
+
print(f"[GraphUpdater] Params: {total_params:,} (Trainable: {trainable_params:,})")
|
| 359 |
+
|
| 360 |
def _build_edge_groups(self) -> Dict[tuple, List[str]]:
|
| 361 |
"""Group relations by (src_type, dst_type) so conv weights can be shared."""
|
| 362 |
groups: Dict[tuple, List[str]] = defaultdict(list)
|
models/helper_encoders.py
CHANGED
|
@@ -33,6 +33,11 @@ class ContextualTimeEncoder(nn.Module):
|
|
| 33 |
# Cast the entire module to the specified dtype
|
| 34 |
self.to(dtype)
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def _sinusoidal_encode(self, values: torch.Tensor, d_model: int) -> torch.Tensor:
|
| 37 |
device = values.device
|
| 38 |
half_dim = d_model // 2
|
|
|
|
| 33 |
# Cast the entire module to the specified dtype
|
| 34 |
self.to(dtype)
|
| 35 |
|
| 36 |
+
# Log params
|
| 37 |
+
total_params = sum(p.numel() for p in self.parameters())
|
| 38 |
+
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 39 |
+
print(f"[ContextualTimeEncoder] Params: {total_params:,} (Trainable: {trainable_params:,})")
|
| 40 |
+
|
| 41 |
def _sinusoidal_encode(self, values: torch.Tensor, d_model: int) -> torch.Tensor:
|
| 42 |
device = values.device
|
| 43 |
half_dim = d_model // 2
|
models/model.py
CHANGED
|
@@ -375,14 +375,6 @@ class Oracle(nn.Module):
|
|
| 375 |
# 1a. Encode Tokens
|
| 376 |
# --- FIXED: Check for a key that still exists ---
|
| 377 |
if token_encoder_inputs['name_embed_indices'].numel() > 0:
|
| 378 |
-
# --- AGGRESSIVE LOGGING ---
|
| 379 |
-
print("\n--- [Oracle DynamicEncoder LOG] ---")
|
| 380 |
-
print(f"[Oracle LOG] embedding_pool shape: {embedding_pool.shape}")
|
| 381 |
-
print(f"[Oracle LOG] name_embed_indices (shape {token_encoder_inputs['name_embed_indices'].shape}):\n{token_encoder_inputs['name_embed_indices']}")
|
| 382 |
-
print(f"[Oracle LOG] symbol_embed_indices (shape {token_encoder_inputs['symbol_embed_indices'].shape}):\n{token_encoder_inputs['symbol_embed_indices']}")
|
| 383 |
-
print(f"[Oracle LOG] image_embed_indices (shape {token_encoder_inputs['image_embed_indices'].shape}):\n{token_encoder_inputs['image_embed_indices']}")
|
| 384 |
-
print("--- [Oracle LOG] Calling F.embedding and TokenEncoder... ---")
|
| 385 |
-
# --- END LOGGING ---
|
| 386 |
# --- NEW: Gather pre-computed embeddings and pass to encoder ---
|
| 387 |
# --- CRITICAL FIX: Remove keys that are not part of the TokenEncoder's signature ---
|
| 388 |
encoder_args = token_encoder_inputs.copy()
|
|
|
|
| 375 |
# 1a. Encode Tokens
|
| 376 |
# --- FIXED: Check for a key that still exists ---
|
| 377 |
if token_encoder_inputs['name_embed_indices'].numel() > 0:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
# --- NEW: Gather pre-computed embeddings and pass to encoder ---
|
| 379 |
# --- CRITICAL FIX: Remove keys that are not part of the TokenEncoder's signature ---
|
| 380 |
encoder_args = token_encoder_inputs.copy()
|
models/multi_modal_processor.py
CHANGED
|
@@ -11,6 +11,8 @@ import os
|
|
| 11 |
import traceback
|
| 12 |
import numpy as np
|
| 13 |
|
|
|
|
|
|
|
| 14 |
# Suppress warnings
|
| 15 |
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
| 16 |
|
|
@@ -22,6 +24,10 @@ class MultiModalEncoder:
|
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self, model_id="google/siglip-so400m-patch16-256-i18n", dtype: torch.dtype = torch.bfloat16, device: str = None):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
self.model_id = model_id
|
| 26 |
if device:
|
| 27 |
self.device = device
|
|
@@ -72,46 +78,24 @@ class MultiModalEncoder:
|
|
| 72 |
|
| 73 |
autocast_dtype = self.dtype if self.dtype in [torch.float16, torch.bfloat16] else None
|
| 74 |
|
| 75 |
-
|
| 76 |
-
print(f"[MME LOG] Input data preview: {str(x[0])[:100] if is_text else x[0]}")
|
| 77 |
-
|
| 78 |
-
with torch.amp.autocast(device_type=self.device, enabled=(self.device == 'cuda' and autocast_dtype is not None), dtype=autocast_dtype):
|
| 79 |
try:
|
| 80 |
if is_text:
|
| 81 |
-
inputs = self.processor(
|
| 82 |
-
text=x,
|
| 83 |
-
return_tensors="pt",
|
| 84 |
-
padding="max_length",
|
| 85 |
-
truncation=True
|
| 86 |
-
).to(self.device)
|
| 87 |
-
print(f"[MME LOG] Text processor output shape: {inputs['input_ids'].shape}")
|
| 88 |
embeddings = self.model.get_text_features(**inputs)
|
| 89 |
else:
|
| 90 |
-
|
| 91 |
-
inputs = self.processor(
|
| 92 |
-
images=rgb_images,
|
| 93 |
-
return_tensors="pt"
|
| 94 |
-
).to(self.device)
|
| 95 |
-
|
| 96 |
-
if 'pixel_values' in inputs and inputs['pixel_values'].dtype != self.dtype:
|
| 97 |
-
inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
|
| 98 |
-
|
| 99 |
embeddings = self.model.get_image_features(**inputs)
|
| 100 |
|
| 101 |
-
print(f"[MME LOG] Raw model output embeddings shape: {embeddings.shape}, dtype: {embeddings.dtype}")
|
| 102 |
-
|
| 103 |
-
# <<< THIS IS THE FIX. I accidentally removed this.
|
| 104 |
# Normalize in float32 for numerical stability
|
| 105 |
embeddings = F.normalize(embeddings.float(), p=2, dim=-1)
|
| 106 |
-
print(f"[MME LOG] Normalized embeddings shape: {embeddings.shape}, dtype: {embeddings.dtype}")
|
| 107 |
|
| 108 |
final_embeddings = embeddings.to(self.dtype)
|
| 109 |
-
print(f"[MME LOG] Final embeddings shape: {final_embeddings.shape}, dtype: {final_embeddings.dtype}. EXITING __call__.")
|
| 110 |
return final_embeddings
|
| 111 |
|
| 112 |
except Exception as e:
|
| 113 |
-
|
| 114 |
-
traceback.print_exc()
|
| 115 |
return torch.empty(0, self.embedding_dim).to(self.device)
|
| 116 |
|
| 117 |
# --- Test block (SigLIP) ---
|
|
|
|
| 11 |
import traceback
|
| 12 |
import numpy as np
|
| 13 |
|
| 14 |
+
from transformers.utils import logging as hf_logging
|
| 15 |
+
|
| 16 |
# Suppress warnings
|
| 17 |
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
|
| 18 |
|
|
|
|
| 24 |
"""
|
| 25 |
|
| 26 |
def __init__(self, model_id="google/siglip-so400m-patch16-256-i18n", dtype: torch.dtype = torch.bfloat16, device: str = None):
|
| 27 |
+
# Force silence progress bars locally for this class
|
| 28 |
+
hf_logging.set_verbosity_error()
|
| 29 |
+
hf_logging.disable_progress_bar()
|
| 30 |
+
|
| 31 |
self.model_id = model_id
|
| 32 |
if device:
|
| 33 |
self.device = device
|
|
|
|
| 78 |
|
| 79 |
autocast_dtype = self.dtype if self.dtype in [torch.float16, torch.bfloat16] else None
|
| 80 |
|
| 81 |
+
with torch.autocast(device_type=self.device, dtype=autocast_dtype, enabled=(autocast_dtype is not None)):
|
|
|
|
|
|
|
|
|
|
| 82 |
try:
|
| 83 |
if is_text:
|
| 84 |
+
inputs = self.processor(text=x, return_tensors="pt", padding=True, truncation=True).to(self.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
embeddings = self.model.get_text_features(**inputs)
|
| 86 |
else:
|
| 87 |
+
inputs = self.processor(images=x, return_tensors="pt").to(self.device)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
embeddings = self.model.get_image_features(**inputs)
|
| 89 |
|
|
|
|
|
|
|
|
|
|
| 90 |
# Normalize in float32 for numerical stability
|
| 91 |
embeddings = F.normalize(embeddings.float(), p=2, dim=-1)
|
|
|
|
| 92 |
|
| 93 |
final_embeddings = embeddings.to(self.dtype)
|
|
|
|
| 94 |
return final_embeddings
|
| 95 |
|
| 96 |
except Exception as e:
|
| 97 |
+
# Silently fail or log debug only if needed
|
| 98 |
+
# traceback.print_exc()
|
| 99 |
return torch.empty(0, self.embedding_dim).to(self.device)
|
| 100 |
|
| 101 |
# --- Test block (SigLIP) ---
|
models/ohlc_embedder.py
CHANGED
|
@@ -71,6 +71,11 @@ class OHLCEmbedder(nn.Module):
|
|
| 71 |
|
| 72 |
self.to(dtype)
|
| 73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
def forward(self, x: torch.Tensor, interval_ids: torch.Tensor) -> torch.Tensor:
|
| 75 |
"""
|
| 76 |
Args:
|
|
|
|
| 71 |
|
| 72 |
self.to(dtype)
|
| 73 |
|
| 74 |
+
# Log params
|
| 75 |
+
total_params = sum(p.numel() for p in self.parameters())
|
| 76 |
+
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 77 |
+
print(f"[OHLCEmbedder] Params: {total_params:,} (Trainable: {trainable_params:,})")
|
| 78 |
+
|
| 79 |
def forward(self, x: torch.Tensor, interval_ids: torch.Tensor) -> torch.Tensor:
|
| 80 |
"""
|
| 81 |
Args:
|
models/token_encoder.py
CHANGED
|
@@ -98,6 +98,11 @@ class TokenEncoder(nn.Module):
|
|
| 98 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 99 |
self.to(device=device, dtype=dtype)
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
def forward(
|
| 102 |
self,
|
| 103 |
name_embeds: torch.Tensor,
|
|
@@ -123,21 +128,14 @@ class TokenEncoder(nn.Module):
|
|
| 123 |
device = name_embeds.device
|
| 124 |
batch_size = name_embeds.shape[0]
|
| 125 |
|
| 126 |
-
# 2. Get Protocol embedding (small)
|
| 127 |
-
print(f"\n--- [TokenEncoder LOG] ENTERING FORWARD PASS (Batch Size: {batch_size}) ---")
|
| 128 |
-
print(f"[TokenEncoder LOG] Input protocol_ids (shape {protocol_ids.shape}):\n{protocol_ids}")
|
| 129 |
-
print(f"[TokenEncoder LOG] Protocol Embedding Vocab Size: {self.protocol_embedding.num_embeddings}")
|
| 130 |
-
|
| 131 |
protocol_ids_long = protocol_ids.to(device, dtype=torch.long)
|
| 132 |
protocol_emb_raw = self.protocol_embedding(protocol_ids_long) # [B, 64]
|
| 133 |
-
print(f"[TokenEncoder LOG] Raw protocol embeddings shape: {protocol_emb_raw.shape}")
|
| 134 |
|
| 135 |
# NEW: Get vanity embedding
|
| 136 |
vanity_ids_long = is_vanity_flags.to(device, dtype=torch.long)
|
| 137 |
vanity_emb_raw = self.vanity_embedding(vanity_ids_long) # [B, 32]
|
| 138 |
|
| 139 |
# 3. Project all features to internal_dim (e.g., 1024)
|
| 140 |
-
print(f"[TokenEncoder LOG] Projecting features to internal_dim: {self.internal_dim}")
|
| 141 |
name_emb = self.name_proj(name_embeds)
|
| 142 |
symbol_emb = self.symbol_proj(symbol_embeds)
|
| 143 |
image_emb = self.image_proj(image_embeds)
|
|
@@ -153,16 +151,8 @@ class TokenEncoder(nn.Module):
|
|
| 153 |
vanity_emb, # NEW: Add the vanity embedding to the sequence
|
| 154 |
], dim=1)
|
| 155 |
|
| 156 |
-
print(f"[TokenEncoder LOG] Stacked feature_sequence shape: {feature_sequence.shape}")
|
| 157 |
-
print(f" - name_emb shape: {name_emb.shape}")
|
| 158 |
-
print(f" - symbol_emb shape: {symbol_emb.shape}")
|
| 159 |
-
print(f" - image_emb shape: {image_emb.shape}")
|
| 160 |
-
print(f" - protocol_emb shape: {protocol_emb.shape}")
|
| 161 |
-
print(f" - vanity_emb shape: {vanity_emb.shape}") # ADDED: Log the new vanity embedding shape
|
| 162 |
-
|
| 163 |
# 5. Create the padding mask (all False, since we have a fixed number of features for all)
|
| 164 |
padding_mask = torch.zeros(batch_size, feature_sequence.shape[1], device=device, dtype=torch.bool)
|
| 165 |
-
print(f"[TokenEncoder LOG] Created padding_mask of shape: {padding_mask.shape}")
|
| 166 |
|
| 167 |
# 6. Fuse the sequence with the Transformer Encoder
|
| 168 |
# This returns the [CLS] token output.
|
|
@@ -171,12 +161,9 @@ class TokenEncoder(nn.Module):
|
|
| 171 |
item_embeds=feature_sequence,
|
| 172 |
src_key_padding_mask=padding_mask
|
| 173 |
)
|
| 174 |
-
print(f"[TokenEncoder LOG] Fused embedding shape after transformer: {fused_embedding.shape}")
|
| 175 |
|
| 176 |
# 7. Project to the final output dimension
|
| 177 |
# Shape: [B, output_dim]
|
| 178 |
token_vibe_embedding = self.final_projection(fused_embedding)
|
| 179 |
-
print(f"[TokenEncoder LOG] Final token_vibe_embedding shape: {token_vibe_embedding.shape}")
|
| 180 |
-
print(f"--- [TokenEncoder LOG] EXITING FORWARD PASS ---\n")
|
| 181 |
|
| 182 |
return token_vibe_embedding
|
|
|
|
| 98 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 99 |
self.to(device=device, dtype=dtype)
|
| 100 |
|
| 101 |
+
# Log params
|
| 102 |
+
total_params = sum(p.numel() for p in self.parameters())
|
| 103 |
+
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 104 |
+
print(f"[TokenEncoder] Params: {total_params:,} (Trainable: {trainable_params:,})")
|
| 105 |
+
|
| 106 |
def forward(
|
| 107 |
self,
|
| 108 |
name_embeds: torch.Tensor,
|
|
|
|
| 128 |
device = name_embeds.device
|
| 129 |
batch_size = name_embeds.shape[0]
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
protocol_ids_long = protocol_ids.to(device, dtype=torch.long)
|
| 132 |
protocol_emb_raw = self.protocol_embedding(protocol_ids_long) # [B, 64]
|
|
|
|
| 133 |
|
| 134 |
# NEW: Get vanity embedding
|
| 135 |
vanity_ids_long = is_vanity_flags.to(device, dtype=torch.long)
|
| 136 |
vanity_emb_raw = self.vanity_embedding(vanity_ids_long) # [B, 32]
|
| 137 |
|
| 138 |
# 3. Project all features to internal_dim (e.g., 1024)
|
|
|
|
| 139 |
name_emb = self.name_proj(name_embeds)
|
| 140 |
symbol_emb = self.symbol_proj(symbol_embeds)
|
| 141 |
image_emb = self.image_proj(image_embeds)
|
|
|
|
| 151 |
vanity_emb, # NEW: Add the vanity embedding to the sequence
|
| 152 |
], dim=1)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
# 5. Create the padding mask (all False, since we have a fixed number of features for all)
|
| 155 |
padding_mask = torch.zeros(batch_size, feature_sequence.shape[1], device=device, dtype=torch.bool)
|
|
|
|
| 156 |
|
| 157 |
# 6. Fuse the sequence with the Transformer Encoder
|
| 158 |
# This returns the [CLS] token output.
|
|
|
|
| 161 |
item_embeds=feature_sequence,
|
| 162 |
src_key_padding_mask=padding_mask
|
| 163 |
)
|
|
|
|
| 164 |
|
| 165 |
# 7. Project to the final output dimension
|
| 166 |
# Shape: [B, output_dim]
|
| 167 |
token_vibe_embedding = self.final_projection(fused_embedding)
|
|
|
|
|
|
|
| 168 |
|
| 169 |
return token_vibe_embedding
|
models/wallet_encoder.py
CHANGED
|
@@ -95,6 +95,28 @@ class WalletEncoder(nn.Module):
|
|
| 95 |
)
|
| 96 |
self.to(dtype)
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def _build_mlp(self, in_dim, out_dim):
|
| 99 |
return nn.Sequential(
|
| 100 |
nn.Linear(in_dim, out_dim * 2),
|
|
|
|
| 95 |
)
|
| 96 |
self.to(dtype)
|
| 97 |
|
| 98 |
+
# Log params (excluding the shared encoder which might be huge and already logged)
|
| 99 |
+
# Note: self.encoder is external, but if we include it here, it will double count.
|
| 100 |
+
# Ideally we only log *this* module's params.
|
| 101 |
+
my_params = sum(p.numel() for p in self.parameters())
|
| 102 |
+
# Note: `self.encoder = encoder` does NOT register the shared encoder's
# parameters on this module — MultiModalEncoder is a plain wrapper class,
# not an nn.Module, so the assignment is an ordinary attribute. Hence
# self.parameters() covers only the MLPs/SetEncoders defined in
# WalletEncoder, and the shared encoder is neither double-counted nor
# double-logged here.
|
| 116 |
+
|
| 117 |
+
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
|
| 118 |
+
print(f"[WalletEncoder] Params: {my_params:,} (Trainable: {trainable_params:,})")
|
| 119 |
+
|
| 120 |
def _build_mlp(self, in_dim, out_dim):
|
| 121 |
return nn.Sequential(
|
| 122 |
nn.Linear(in_dim, out_dim * 2),
|
scripts/cache_dataset.py
CHANGED
|
@@ -2,12 +2,20 @@
|
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
import argparse
|
|
|
|
| 5 |
import datetime
|
| 6 |
import torch
|
| 7 |
import json
|
| 8 |
from pathlib import Path
|
| 9 |
from tqdm import tqdm
|
| 10 |
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Add parent directory to path to import modules
|
| 13 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
@@ -19,9 +27,84 @@ from scripts.analyze_distribution import get_return_class_map
|
|
| 19 |
from clickhouse_driver import Client as ClickHouseClient
|
| 20 |
from neo4j import GraphDatabase
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def main():
|
| 23 |
load_dotenv()
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
parser = argparse.ArgumentParser(description="Cache dataset samples for training.")
|
| 26 |
parser.add_argument("--output_dir", type=str, default="data/cache", help="Directory to save cached samples")
|
| 27 |
parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to generate")
|
|
@@ -50,6 +133,9 @@ def main():
|
|
| 50 |
neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
|
| 51 |
|
| 52 |
try:
|
|
|
|
|
|
|
|
|
|
| 53 |
# --- 2. Initialize DataFetcher and OracleDataset ---
|
| 54 |
data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
|
| 55 |
|
|
|
|
| 2 |
import os
|
| 3 |
import sys
|
| 4 |
import argparse
|
| 5 |
+
import numpy as np
|
| 6 |
import datetime
|
| 7 |
import torch
|
| 8 |
import json
|
| 9 |
from pathlib import Path
|
| 10 |
from tqdm import tqdm
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
+
import huggingface_hub
|
| 13 |
+
import logging
|
| 14 |
+
|
| 15 |
+
# Suppress noisy libraries
|
| 16 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
| 17 |
+
logging.getLogger("transformers").setLevel(logging.ERROR)
|
| 18 |
+
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
|
| 19 |
|
| 20 |
# Add parent directory to path to import modules
|
| 21 |
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
| 27 |
from clickhouse_driver import Client as ClickHouseClient
|
| 28 |
from neo4j import GraphDatabase
|
| 29 |
|
| 30 |
+
def compute_save_ohlc_stats(client: "ClickHouseClient", output_path: str,
                            min_price: float = 0.0, min_vol: float = 0.0):
    """
    Compute global mean/std statistics for trade prices and volumes from
    ClickHouse and persist them to an ``.npz`` file.

    The dataset loader reads this file to z-score-normalize OHLC inputs.

    Args:
        client: Connected ClickHouse client; must expose
            ``execute(query, params=...)`` returning rows of tuples.
        output_path: Destination path for the ``.npz`` stats file
            (parent directories are created as needed).
        min_price: Exclusive lower bound on ``price_usd``; trades at or
            below it are ignored so dust does not skew the stats.
            Default 0.0 (previous hard-coded behavior).
        min_vol: Exclusive lower bound on ``total_usd``; same rationale.
            Default 0.0.

    Best-effort by design: failures are printed, not raised, so the
    caching run can proceed (the dataset loader will complain later if
    the stats file is missing).
    """
    print("INFO: Computing OHLC stats (mean/std) from ClickHouse...")

    # Query matches preprocess_distribution.py logic; the filters avoid
    # skewing the aggregates with dust trades.
    query = """
        SELECT
            AVG(t.price_usd) AS mean_price_usd,
            stddevPop(t.price_usd) AS std_price_usd,
            AVG(t.price) AS mean_price_native,
            stddevPop(t.price) AS std_price_native,
            AVG(t.total_usd) AS mean_trade_value_usd,
            stddevPop(t.total_usd) AS std_trade_value_usd
        FROM trades AS t
        WHERE t.price_usd > %(min_price)s AND t.total_usd > %(min_vol)s
    """
    params = {"min_price": min_price, "min_vol": min_vol}

    try:
        result = client.execute(query, params=params)
        if not result or not result[0]:
            print("WARNING: Stats query returned no rows. Using default identity stats.")
            stats = {
                "mean_price_usd": 0.0, "std_price_usd": 1.0,
                "mean_price_native": 0.0, "std_price_native": 1.0,
                "mean_trade_value_usd": 0.0, "std_trade_value_usd": 1.0,
            }
        else:
            row = result[0]

            # The DB may return NULLs (e.g. empty table); fall back to
            # identity normalization (mean 0, std 1).
            def safe_float(x, default=0.0):
                return float(x) if x is not None else default

            def safe_std(x):
                # Guard against zero/near-zero std -> division blow-ups
                # downstream during normalization.
                val = safe_float(x, 1.0)
                return val if val > 1e-9 else 1.0

            stats = {
                "mean_price_usd": safe_float(row[0]),
                "std_price_usd": safe_std(row[1]),
                "mean_price_native": safe_float(row[2]),
                "std_price_native": safe_std(row[3]),
                "mean_trade_value_usd": safe_float(row[4]),
                "std_trade_value_usd": safe_std(row[5]),
            }

        # Persist as NPZ next to the cache; create parents if needed.
        out_p = Path(output_path)
        out_p.parent.mkdir(parents=True, exist_ok=True)
        np.savez(out_p, **stats)

        print(f"INFO: Saved OHLC stats to {out_p}")
        for k, v in stats.items():
            print(f"  {k}: {v:.4f}")

    except Exception as e:
        # Deliberate best-effort: don't crash the caching run over stats.
        print(f"ERROR: Failed to compute OHLC stats: {e}")
|
| 96 |
+
|
| 97 |
def main():
|
| 98 |
load_dotenv()
|
| 99 |
|
| 100 |
+
# Explicit Login
|
| 101 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 102 |
+
if hf_token:
|
| 103 |
+
print(f"INFO: Logging in to Hugging Face with token starting with: {hf_token[:4]}...")
|
| 104 |
+
huggingface_hub.login(token=hf_token)
|
| 105 |
+
else:
|
| 106 |
+
print("WARNING: HF_TOKEN not found in environment.")
|
| 107 |
+
|
| 108 |
parser = argparse.ArgumentParser(description="Cache dataset samples for training.")
|
| 109 |
parser.add_argument("--output_dir", type=str, default="data/cache", help="Directory to save cached samples")
|
| 110 |
parser.add_argument("--max_samples", type=int, default=None, help="Maximum number of samples to generate")
|
|
|
|
| 133 |
neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))
|
| 134 |
|
| 135 |
try:
|
| 136 |
+
# --- 1. Compute OHLC Stats (Global) ---
|
| 137 |
+
compute_save_ohlc_stats(clickhouse_client, args.ohlc_stats_path)
|
| 138 |
+
|
| 139 |
# --- 2. Initialize DataFetcher and OracleDataset ---
|
| 140 |
data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
|
| 141 |
|
train.py
CHANGED
|
@@ -15,6 +15,14 @@ resolved_tmp = str(_DEFAULT_TMP.resolve())
|
|
| 15 |
for key in ("TMPDIR", "TMP", "TEMP"):
|
| 16 |
os.environ.setdefault(key, resolved_tmp)
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
try:
|
| 19 |
mp.set_start_method('spawn', force=True)
|
| 20 |
except RuntimeError:
|
|
@@ -100,8 +108,8 @@ def quantile_pinball_loss(preds: torch.Tensor,
|
|
| 100 |
# Preds shape: [B, Horizons * Quantiles]
|
| 101 |
# Logic assumes interleaved outputs or consistent flattening.
|
| 102 |
pred_slice = preds[:, idx::num_quantiles]
|
| 103 |
-
target_slice = targets
|
| 104 |
-
mask_slice = mask
|
| 105 |
|
| 106 |
diff = target_slice - pred_slice
|
| 107 |
pinball = torch.maximum((q - 1.0) * diff, q * diff)
|
|
@@ -118,6 +126,44 @@ def filtered_collate(collator: MemecoinCollator,
|
|
| 118 |
return None
|
| 119 |
return collator(batch)
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
def parse_args() -> argparse.Namespace:
|
| 123 |
parser = argparse.ArgumentParser(description="Train the Oracle quantile model.")
|
|
@@ -209,6 +255,25 @@ def main() -> None:
|
|
| 209 |
max_seq_len = args.max_seq_len
|
| 210 |
max_seq_len = args.max_seq_len
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
logger.info(f"Initializing Encoders with dtype={init_dtype}...")
|
| 213 |
|
| 214 |
# Encoders
|
|
@@ -423,6 +488,11 @@ def main() -> None:
|
|
| 423 |
# Logging
|
| 424 |
if accelerator.sync_gradients:
|
| 425 |
total_steps += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 426 |
current_loss = loss.item()
|
| 427 |
epoch_loss += current_loss
|
| 428 |
valid_batches += 1
|
|
|
|
| 15 |
for key in ("TMPDIR", "TMP", "TEMP"):
|
| 16 |
os.environ.setdefault(key, resolved_tmp)
|
| 17 |
|
| 18 |
+
# --- Environment & Logging Setup ---
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
import huggingface_hub
|
| 21 |
+
from transformers.utils import logging as hf_logging
|
| 22 |
+
|
| 23 |
+
# Load .env explicitly (benign at global scope but moving heavy lifting to main)
|
| 24 |
+
load_dotenv()
|
| 25 |
+
|
| 26 |
try:
|
| 27 |
mp.set_start_method('spawn', force=True)
|
| 28 |
except RuntimeError:
|
|
|
|
| 108 |
# Preds shape: [B, Horizons * Quantiles]
|
| 109 |
# Logic assumes interleaved outputs or consistent flattening.
|
| 110 |
pred_slice = preds[:, idx::num_quantiles]
|
| 111 |
+
target_slice = targets
|
| 112 |
+
mask_slice = mask
|
| 113 |
|
| 114 |
diff = target_slice - pred_slice
|
| 115 |
pinball = torch.maximum((q - 1.0) * diff, q * diff)
|
|
|
|
| 126 |
return None
|
| 127 |
return collator(batch)
|
| 128 |
|
| 129 |
+
def log_debug_batch_context(batch: Dict[str, Any], logger: logging.Logger, step: int):
    """
    Log a human-readable preview of the first sample in `batch`: the
    decoded event stream plus the heads of its labels and label mask.
    Use this to verify what the model is actually seeing.
    """
    if not logger.isEnabledFor(logging.INFO):
        return

    try:
        first = 0  # inspect only the leading sample in the batch
        raw_ids = batch['event_type_ids'][first].cpu()   # [L]
        raw_labels = batch['labels'][first].cpu()        # [Horizons * Quantiles]
        raw_mask = batch['labels_mask'][first].cpu()

        # Map event ids -> names via the vocab, dropping PAD (id 0).
        decoded = [
            vocab.ID_TO_EVENT.get(int(tok), f"UNK_{int(tok)}")
            for tok in raw_ids
            if int(tok) != 0
        ]

        logger.info(f"\n--- [Step {step}] Batch Input Preview (Sample 0) ---")

        # Show only the tail of the event stream (e.g. last 50 events).
        tail_len = 50
        tail_preview = ", ".join(decoded[-tail_len:])
        logger.info(f"Event Stream (Last {tail_len} of {len(decoded)}): [{tail_preview}]")

        # Labels are assumed flattened to [H*Q]; show the first few.
        logger.info(f"Labels (First 10): {raw_labels[:10].tolist()}")
        logger.info(f"Masks (First 10): {raw_mask[:10].tolist()}")
        logger.info("----------------------------------------------------\n")

    except Exception as e:
        # Diagnostics only — never let logging break the training loop.
        logger.warning(f"Failed to log batch context: {e}")
|
| 166 |
+
|
| 167 |
|
| 168 |
def parse_args() -> argparse.Namespace:
|
| 169 |
parser = argparse.ArgumentParser(description="Train the Oracle quantile model.")
|
|
|
|
| 255 |
max_seq_len = args.max_seq_len
|
| 256 |
max_seq_len = args.max_seq_len
|
| 257 |
|
| 258 |
+
|
| 259 |
+
# --- Environment & Logging Setup ---
|
| 260 |
+
# Load .env explicitly
|
| 261 |
+
load_dotenv()
|
| 262 |
+
|
| 263 |
+
# Suppress noisy libraries
|
| 264 |
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
| 265 |
+
logging.getLogger("transformers").setLevel(logging.ERROR)
|
| 266 |
+
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
|
| 267 |
+
hf_logging.disable_progress_bar()
|
| 268 |
+
|
| 269 |
+
# Explicit Login
|
| 270 |
+
hf_token = os.getenv("HF_TOKEN")
|
| 271 |
+
if hf_token:
|
| 272 |
+
print(f"Logging in to Hugging Face with token starting with: {hf_token[:4]}...")
|
| 273 |
+
huggingface_hub.login(token=hf_token)
|
| 274 |
+
else:
|
| 275 |
+
print("WARNING: HF_TOKEN not found in environment.")
|
| 276 |
+
|
| 277 |
logger.info(f"Initializing Encoders with dtype={init_dtype}...")
|
| 278 |
|
| 279 |
# Encoders
|
|
|
|
| 488 |
# Logging
|
| 489 |
if accelerator.sync_gradients:
|
| 490 |
total_steps += 1
|
| 491 |
+
|
| 492 |
+
# --- NEW: Debug Log Batch Context ---
|
| 493 |
+
if total_steps % log_every == 0 and accelerator.is_main_process:
|
| 494 |
+
log_debug_batch_context(batch, logger, total_steps)
|
| 495 |
+
|
| 496 |
current_loss = loss.item()
|
| 497 |
epoch_loss += current_loss
|
| 498 |
valid_batches += 1
|