zirobtc committed
Commit 858826c · 1 Parent(s): 3596954

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+log.log filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
+# Ignore the __pycache__ directory anywhere in the repository
+__pycache__/
+
+# Ignore all .txt files anywhere in the repository
+*.txt
+
+# Ignore the 'runs' directory anywhere in the repository, regardless of nesting
+runs/
+
+data/pump_fun
+
+.env
FullCryptoGuide.md ADDED
The diff for this file is too large to render. See raw diff
 
README.md ADDED
@@ -0,0 +1,776 @@
# =========================================
# Entity Encoders
# =========================================
# These are generated offline/streaming and are the "vocabulary" for the model.

<WalletEmbedding> # Embedding of a wallet's relationships, behavior, and history.
<WalletEmbedding> = [
  // Data from the 'wallet_profiles' table (wallet-level lifetime and daily/weekly stats)
  wallet_profiles_row: [
    // Core Info & Timestamps
    age,            // Wallet age (no contextual normalization)
    wallet_address, // Primary wallet identifier

    // Deployed Token Aggregates
    deployed_tokens_count,              // Total tokens created
    deployed_tokens_migrated_pct,       // % that migrated
    deployed_tokens_avg_lifetime_sec,   // Avg duration before the dev starts selling
    deployed_tokens_avg_peak_mc_usd,    // Avg peak market cap
    deployed_tokens_median_peak_mc_usd, // Median peak market cap

    // Metadata & Balances
    balance, // Current SOL balance

    // Lifetime Transaction Counts (total history)
    transfers_in_count,      // Total native transfers received
    transfers_out_count,     // Total native transfers sent
    spl_transfers_in_count,  // Total SPL token transfers received
    spl_transfers_out_count, // Total SPL token transfers sent

    // Lifetime Trading Stats (total history)
    total_buys_count,  // Total buys across all tokens
    total_sells_count, // Total sells across all tokens
    total_winrate,     // Overall trading winrate

    // 1-Day Stats (realized P&L, counts, averages, volume, fees, winrate)
    stats_1d_realized_profit_sol,
    stats_1d_realized_profit_pnl,
    stats_1d_buy_count,
    stats_1d_sell_count,
    stats_1d_transfer_in_count,
    stats_1d_transfer_out_count,
    stats_1d_avg_holding_period,
    stats_1d_total_bought_cost_sol,
    stats_1d_total_sold_income_sol,
    stats_1d_total_fee,
    stats_1d_winrate,
    stats_1d_tokens_traded,

    // 7-Day Stats (realized P&L, counts, averages, volume, fees, winrate)
    stats_7d_realized_profit_sol,
    stats_7d_realized_profit_pnl,
    stats_7d_buy_count,
    stats_7d_sell_count,
    stats_7d_transfer_in_count,
    stats_7d_transfer_out_count,
    stats_7d_avg_holding_period,
    stats_7d_total_bought_cost_sol,
    stats_7d_total_sold_income_sol,
    stats_7d_total_fee,
    stats_7d_winrate,
    stats_7d_tokens_traded,

    // 30-day stats are omitted: too stale to be useful in this context
  ],

  // Data from the 'wallet_socials' table (social media and profile info)
  wallet_socials_row: [
    has_pf_profile,
    has_twitter,
    has_telegram,
    is_exchange_wallet,
    username,
  ],

  // Data from the 'wallet_holdings' table (token-level statistics for held tokens)
  wallet_holdings_pool: [
    <TokenVibeEmbedding>,
    holding_time, // How long the wallet held the token (only currently-held or recently-traded tokens are checked)

    balance_pct_to_supply, // Current quantity of the token held, as % of supply

    // History (Amounts & Costs)
    history_bought_amount_sol,               // Total SOL spent buying the token
    bought_amount_sol_pct_to_native_balance, // How large the buys were relative to the wallet's balance

    // History (Counts)
    history_total_buys,  // Total number of buy transactions
    history_total_sells, // Total number of sell transactions

    // Profit and Loss
    realized_profit_pnl, // Realized P&L as a percentage
    realized_profit_sol,

    // Transfers (non-trade movements)
    history_transfer_in,
    history_transfer_out,

    avarage_trade_gap_seconds,
    total_priority_fees, // Total tips + priority fees
  ]
]
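
The `wallet_profiles_row` fields above ultimately have to become a fixed-order numeric vector. A minimal sketch of that flattening step, assuming log-compression for heavy-tailed counts and balances (the field subset and scaling choice here are illustrative assumptions, not the repo's actual code):

```python
import math

# Hypothetical subset of the wallet_profiles_row fields listed above.
PROFILE_FIELDS = [
    "age", "balance",
    "transfers_in_count", "transfers_out_count",
    "total_buys_count", "total_sells_count", "total_winrate",
    "stats_1d_realized_profit_sol", "stats_7d_realized_profit_sol",
]

def profile_to_features(row: dict) -> list:
    """Flatten a wallet_profiles row into a fixed-order numeric vector.

    Counts and balances are log-compressed; missing fields default to 0.
    """
    feats = []
    for name in PROFILE_FIELDS:
        v = float(row.get(name, 0.0))
        if name.endswith("_count") or name in ("age", "balance"):
            # compress heavy-tailed magnitudes while preserving sign
            v = math.copysign(math.log1p(abs(v)), v)
        feats.append(v)
    return feats
```

Ratios such as `total_winrate` are already bounded in [0, 1], so they pass through unscaled.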

<TokenVibeEmbedding> # Multimodal embedding of a token's identity.
<TokenVibeEmbedding> = [<TokenAddressEmbedding>, <NameEmbedding>, <SymbolEmbedding>, <ImageEmbedding>, protocol_id]

<TextEmbedding>  # Text embedding from the multimodal processor.
<MediaEmbedding> # Multimodal ViT encoder.

# -----------------------------------------
# 1. TradeEncoder
# -----------------------------------------

# Captures large-size trades from any wallet.
[timestamp, 'LargeTrade', relative_ts, <WalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]

# Captures the high-signal "Dev Sold or Bought" event.
[timestamp, 'Deployer_Trade', relative_ts, <CreatorWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]

# Captures *all* trades from pre-defined high-P&L/high-winrate, KOL, and otherwise known wallets.
[timestamp, 'SmartWallet_Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]

# Raw trades. Loaded in the H/B/H Prefix (first ~10k) and Suffix (last ~5k).
[timestamp, 'Trade', relative_ts, <TraderWalletEmbedding>, trade_direction, sol_amount, dex_platform_id, priority_fee, mev_protection, token_amount_pct_of_holding, quote_amount_pct_of_holding, slippage, price_impact, success, is_bundle, total_usd]
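
The four trade variants above can be seen as a routing decision over raw trades. A hedged sketch of that routing, where the size threshold and the smart-wallet set are illustrative assumptions (the doc does not define what counts as "large"):

```python
LARGE_TRADE_MIN_SOL = 5.0  # assumed threshold for 'LargeTrade'

def trade_event_type(trade: dict, creator: str, smart_wallets: set) -> str:
    """Route a raw trade to one of the TradeEncoder event variants."""
    wallet = trade["wallet_address"]
    if wallet == creator:
        return "Deployer_Trade"    # highest-signal: the dev bought or sold
    if wallet in smart_wallets:
        return "SmartWallet_Trade" # pre-defined high-P&L / KOL wallets
    if trade["sol_amount"] >= LARGE_TRADE_MIN_SOL:
        return "LargeTrade"
    return "Trade"                 # raw trade, kept in the prefix/suffix windows
```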

# -----------------------------------------
# 2. TransferEncoder
# -----------------------------------------

# Raw transfers. Loaded in the H/B/H Prefix (all within the first ~10k-trade window) and Suffix (all within the last ~5k-trade window).
[timestamp, 'Transfer', relative_ts, <SourceWalletEmbedding>, <DestinationWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]

# Captures scarce, large transfers *after* the initial launch window.
[timestamp, 'LargeTransfer', relative_ts, <FromWalletEmbedding>, <ToWalletEmbedding>, token_amount, transfer_pct_of_total_supply, transfer_pct_of_holding, priority_fee]

# -----------------------------------------
# 3. LifecycleEncoder
# -----------------------------------------

# The T0 event.
[timestamp, 'Mint', 0, <CreatorWalletEmbedding>, <TokenVibeEmbedding>]

# -----------------------------------------
# 4. PoolEncoder
# -----------------------------------------

# Signals migration from launchpad to a real pool.
[timestamp, 'PoolCreated', relative_ts, <ProviderWalletEmbedding>, protocol_id, <QuoteTokenVibeEmbedding>, base_amount, quote_amount, quote_pct_to_main_pool_balance, base_pct_to_main_pool_balance]

# Signals LP addition or removal.
[timestamp, 'LiquidityChange', relative_ts, <ProviderWalletEmbedding>, <QuoteTokenVibeEmbedding>, change_type_id, quote_amount, quote_pct_to_current_pool_balance]

# Signals the creator/dev collecting platform fees.
[timestamp, 'FeeCollected', relative_ts, <RecipientWalletEmbedding>, sol_amount, token_amount]

# -----------------------------------------
# SupplyEncoder
# -----------------------------------------

# Signals a supply reduction.
[timestamp, 'TokenBurn', relative_ts, <BurnerWalletEmbedding>, amount_pct_of_total_supply, amount_tokens_burned]

# Signals locked supply, e.g., for team/marketing.
[timestamp, 'SupplyLock', relative_ts, <LockerWalletEmbedding>, amount_pct_of_total_supply, lock_duration]

# -----------------------------------------
# ChartEncoder
# -----------------------------------------

# (The "Sliding Window") The chart event.
[timestamp, 'Chart_Segment', relative_ts, OHLC_segment, chart_interval_id]

# -----------------------------------------
# PulseEncoder
# -----------------------------------------

# A low-frequency event (dynamic interval: 5 min, 15 min, or 1 hr based on token age).
[timestamp, 'OnChain_Snapshot', relative_ts, total_holders, smart_traders, kols, holder_growth_rate, top_10_holder_pct, sniper_holding_pct, rat_wallets_holding_pct, bundle_holding_pct, current_market_cap, liquidity, volume, buy_count, sell_count, total_txns, global_fees_paid]

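The dynamic snapshot interval mentioned above could be sketched as a simple age-tiered lookup. The three interval tiers come from the doc; the age cut-offs below are pure assumptions, since the doc only names the tiers:

```python
def pulse_interval_sec(token_age_sec: int) -> int:
    """Pick the 'OnChain_Snapshot' cadence from the token's age."""
    if token_age_sec < 6 * 3600:   # fresh token (assumed first six hours): dense sampling
        return 5 * 60
    if token_age_sec < 48 * 3600:  # mid-life (assumed up to two days): medium cadence
        return 15 * 60
    return 60 * 60                 # mature token: hourly
```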
# -----------------------------------------
# HoldersListEncoder
# -----------------------------------------

<HolderDistributionEmbedding> # Transformer-based embedding of the top holders (WalletEmbeddings + holding pct).

# Token-specific holder analysis.
[timestamp, 'HolderSnapshot', relative_ts, <HolderDistributionEmbedding>]

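A minimal stand-in for `<HolderDistributionEmbedding>`: pooling the top holders' wallet embeddings, weighted by holding percentage. The real version is transformer-based per the spec; weighted mean pooling is only a sketch of the same interface:

```python
def holder_distribution_embedding(holders):
    """Pool (wallet_embedding, holding_pct) pairs into one vector.

    Weighted mean over holders; weights are normalized holding percentages.
    """
    if not holders:
        return []
    dim = len(holders[0][0])
    total = sum(pct for _, pct in holders) or 1.0
    pooled = [0.0] * dim
    for emb, pct in holders:
        for i in range(dim):
            pooled[i] += emb[i] * (pct / total)
    return pooled
```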
# -----------------------------------------
# ChainSnapshotEncoder
# -----------------------------------------

# Broad chain-level market conditions.
[timestamp, 'ChainSnapshot', relative_ts, native_token_price_usd, gas_fee]

# Launchpad market regime (using absolute, log-normalized values).
[timestamp, 'Lighthouse_Snapshot', relative_ts, protocol_id, timeframe_id, total_volume, total_transactions, total_traders, total_tokens_created, total_migrations]

# -----------------------------------------
# TokenTrendingListEncoder
# -----------------------------------------

# Fires *per token* on a trending list. The high-attention "meta" signal.
[timestamp, 'TrendingToken', relative_ts, <TokenVibeEmbedding_of_trending_token>, list_source_id, timeframe_id, rank]

# Fires *per token* on the boosted list.
[timestamp, 'BoostedToken', relative_ts, <TokenVibeEmbedding_of_boosted_token>, total_boost_amount, rank]

# -----------------------------------------
# LaunchpadThreadEncoder
# -----------------------------------------

# On-platform social signal (Pump.fun comments).
[timestamp, 'PumpReply', relative_ts, <UserWalletEmbedding>, <ReplyTextEmbedding>]

# -----------------------------------------
# CTEncoder
# -----------------------------------------

# Off-platform social signal (Twitter).
[timestamp, 'XPost', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>]
[timestamp, 'XRetweet', relative_ts, <RetweeterWalletEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]
[timestamp, 'XReply', relative_ts, <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>, <MainTweetEmbedding>]
[timestamp, 'XQuoteTweet', relative_ts, <QuoterWalletEmbedding>, <QuoterTextEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding>]

# -----------------------------------------
# GlobalTrendingEncoder
# -----------------------------------------

# Broader cultural trend signal (TikTok).
[timestamp, 'TikTok_Trending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]

# Broader cultural trend signal (Twitter).
[timestamp, 'XTrending_Hashtag', relative_ts, <HashtagNameEmbedding>, rank]

# -----------------------------------------
# TrackerEncoder
# -----------------------------------------

# Retail marketing signal (paid groups).
[timestamp, 'AlphaGroup_Call', relative_ts, group_id]

[timestamp, 'Call_Channel', relative_ts, channel_id]

# High-impact catalyst event.
[timestamp, 'CexListing', relative_ts, exchange_id]

# High-impact catalyst event.
[timestamp, 'Migrated', relative_ts, protocol_id]

# -----------------------------------------
# DexEncoder
# -----------------------------------------

[timestamp, 'DexBoost_Paid', relative_ts, amount, total_amount_on_token]

[timestamp, 'DexProfile_Updated', relative_ts, has_changed_website_flag, has_changed_twitter_flag, has_changed_telegram_flag, has_changed_description_flag, <WebsiteEmbedding>, <TwitterLinkEmbedding>, <NewDescriptionEmbedding>]

### **Global Context Injection**

<PRELAUNCH> <LAUNCH> <MIDDLE> <RECENT>

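The four context markers can be spliced into the event sequence as special events, so the model can distinguish prefix, middle, and suffix regions. A sketch under the assumption that the boundary indices are computed upstream from the H/B/H windows:

```python
def inject_context_markers(events, launch_idx, middle_idx, recent_idx):
    """Insert PRELAUNCH/LAUNCH/MIDDLE/RECENT marker events into a sequence."""
    out = list(events)
    # insert from the back so earlier indices stay valid
    for idx, marker in sorted(
        [(launch_idx, "LAUNCH"), (middle_idx, "MIDDLE"), (recent_idx, "RECENT")],
        reverse=True,
    ):
        out.insert(idx, {"event_type": marker})
    out.insert(0, {"event_type": "PRELAUNCH"})
    return out
```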
### **Token Role Embedding**

<TokenVibeEmbedding_of_Token_A> + Subject_Token_Role

<TokenVibeEmbedding_of_Token_B> + Trending_Token_Role

<QuoteTokenVibeEmbedding_of_USDC> + Quote_Token_Role

# **Links**

### `TransferLink`

```
['signature', 'source', 'destination', 'mint', 'timestamp']
```

-----

### `BundleTradeLink`

```
['signatures', 'wallet_a', 'wallet_b', 'mint', 'slot', 'timestamp']
```

-----

### `CopiedTradeLink`

```
['leader_buy_sig', 'leader_sell_sig', 'follower_buy_sig', 'follower_sell_sig', 'follower', 'leader', 'mint', 'time_gap_on_buy_sec', 'time_gap_on_sell_sec', 'leader_pnl', 'follower_pnl', 'leader_buy_total', 'leader_sell_total', 'follower_buy_total', 'follower_sell_total', 'follower_buy_slippage', 'follower_sell_slippage']
```

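A hedged sketch of how `CopiedTradeLink` rows could be detected on the buy side: a follower buying the same mint shortly after a leader. The window size is an assumption, and only the buy-side fields of the link schema are populated here:

```python
COPY_WINDOW_SEC = 30  # assumed max follower delay to count as a copy

def find_copied_buys(leader_buys, follower_buys):
    """Both inputs: lists of (signature, mint, timestamp) tuples.

    Returns partial CopiedTradeLink rows for same-mint buys inside the window.
    """
    links = []
    for lsig, lmint, lts in leader_buys:
        for fsig, fmint, fts in follower_buys:
            gap = fts - lts
            if fmint == lmint and 0 < gap <= COPY_WINDOW_SEC:
                links.append({
                    "leader_buy_sig": lsig,
                    "follower_buy_sig": fsig,
                    "mint": lmint,
                    "time_gap_on_buy_sec": gap,
                })
    return links
```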
-----

### `CoordinatedActivityLink`

```
['leader_first_sig', 'leader_second_sig', 'follower_first_sig', 'follower_second_sig', 'follower', 'leader', 'mint', 'time_gap_on_first_sec', 'time_gap_on_second_sec']
```

-----

### `MintedLink`

```
['signature', 'timestamp', 'buy_amount']
```

-----

### `SnipedLink`

```
['signature', 'rank', 'sniped_amount']
```

-----

### `LockedSupplyLink`

```
['signature', 'amount', 'unlock_timestamp']
```

-----

### `BurnedLink`

```
['signature', 'amount', 'timestamp']
```

-----

### `ProvidedLiquidityLink`

```
['signature', 'wallet', 'token', 'pool_address', 'amount_base', 'amount_quote', 'timestamp']
```

-----

### `WhaleOfLink`

```
['wallet', 'token', 'holding_pct_at_creation', 'ath_usd_at_creation']
```

-----

### `TopTraderOfLink`

```
['wallet', 'token', 'pnl_at_creation', 'ath_usd_at_creation']
```

-----

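The mock generator below relies on an `EmbeddingPooler` with a `get_idx` method. This is a hypothetical stand-in that deduplicates payloads and hands out stable indices; the real class presumably also runs the text/image encoders and caches the resulting vectors:

```python
class EmbeddingPooler:
    """Hypothetical minimal pooler: maps payloads to stable embedding indices."""

    def __init__(self):
        self._idx_by_key = {}

    def get_idx(self, payload) -> int:
        """Return a stable index for a text string, image, or None."""
        key = repr(payload)  # crude content key; images would need real content hashing
        if key not in self._idx_by_key:
            self._idx_by_key[key] = len(self._idx_by_key)
        return self._idx_by_key[key]
```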

def __gettestitem__(self, idx: int) -> Dict[str, Any]:
    """
    Generates a single complex data item, structured for the MemecoinCollator.
    NOTE: This currently returns the same mock data regardless of `idx`.
    """
    # --- 1. Set up the pooler ---
    pooler = EmbeddingPooler()

    # --- 2. Create mock raw batch data ---
    print("Creating mock raw batch...")

    # Wallet profiles, socials, and holdings definitions
    profile1 = {
        'wallet_address': 'addrW1', 'age': 1.5e7, 'balance': 10.5,
        'deployed_tokens_count': 2, 'deployed_tokens_migrated_pct': 0.5, 'deployed_tokens_avg_lifetime_sec': 36000.0, 'deployed_tokens_avg_peak_mc_usd': 100000.0, 'deployed_tokens_median_peak_mc_usd': 50000.0,
        'transfers_in_count': 10, 'transfers_out_count': 5, 'spl_transfers_in_count': 20, 'spl_transfers_out_count': 15,
        'total_buys_count': 50, 'total_sells_count': 40, 'total_winrate': 0.6,
        'stats_1d_realized_profit_sol': 1.2, 'stats_1d_realized_profit_pnl': 0.1, 'stats_1d_buy_count': 5, 'stats_1d_sell_count': 3, 'stats_1d_transfer_in_count': 2, 'stats_1d_transfer_out_count': 1, 'stats_1d_avg_holding_period': 3600, 'stats_1d_total_bought_cost_sol': 10.0, 'stats_1d_total_sold_income_sol': 11.2, 'stats_1d_total_fee': 0.1, 'stats_1d_winrate': 0.7, 'stats_1d_tokens_traded': 4,
        'stats_7d_realized_profit_sol': 5.0, 'stats_7d_realized_profit_pnl': 0.2, 'stats_7d_buy_count': 20, 'stats_7d_sell_count': 15, 'stats_7d_transfer_in_count': 8, 'stats_7d_transfer_out_count': 4, 'stats_7d_avg_holding_period': 7200, 'stats_7d_total_bought_cost_sol': 40.0, 'stats_7d_total_sold_income_sol': 45.0, 'stats_7d_total_fee': 0.5, 'stats_7d_winrate': 0.65, 'stats_7d_tokens_traded': 10,
    }
    social1 = {'has_pf_profile': True, 'has_twitter': True, 'has_telegram': False, 'is_exchange_wallet': False, 'username': 'trader_one'}
    holdings1 = [
        {'mint_address': 'tknA', 'holding_time': 3600.0, 'realized_profit_sol': 5.2, 'total_priority_fees': 0.05, 'balance_pct_to_supply': 0.01, 'history_bought_amount_sol': 10, 'bought_amount_sol_pct_to_native_balance': 0.5, 'history_total_buys': 5, 'history_total_sells': 2, 'realized_profit_pnl': 0.52, 'history_transfer_in': 1, 'history_transfer_out': 0, 'avarage_trade_gap_seconds': 300},
    ]
    profile2 = {
        'wallet_address': 'addrW2', 'age': 1e6, 'balance': 1.0,
        'deployed_tokens_count': 0, 'deployed_tokens_migrated_pct': 0.0, 'deployed_tokens_avg_lifetime_sec': 0.0, 'deployed_tokens_avg_peak_mc_usd': 0.0, 'deployed_tokens_median_peak_mc_usd': 0.0,
        'transfers_in_count': 1, 'transfers_out_count': 0, 'spl_transfers_in_count': 0, 'spl_transfers_out_count': 0,
        'total_buys_count': 0, 'total_sells_count': 0, 'total_winrate': 0.0,
        'stats_1d_realized_profit_sol': 0.0, 'stats_1d_realized_profit_pnl': 0.0, 'stats_1d_buy_count': 0, 'stats_1d_sell_count': 0, 'stats_1d_transfer_in_count': 0, 'stats_1d_transfer_out_count': 0, 'stats_1d_avg_holding_period': 0, 'stats_1d_total_bought_cost_sol': 0.0, 'stats_1d_total_sold_income_sol': 0.0, 'stats_1d_total_fee': 0.0, 'stats_1d_winrate': 0.0, 'stats_1d_tokens_traded': 0,
        'stats_7d_realized_profit_sol': 0.0, 'stats_7d_realized_profit_pnl': 0.0, 'stats_7d_buy_count': 0, 'stats_7d_sell_count': 0, 'stats_7d_transfer_in_count': 0, 'stats_7d_transfer_out_count': 0, 'stats_7d_avg_holding_period': 0, 'stats_7d_total_bought_cost_sol': 0.0, 'stats_7d_total_sold_income_sol': 0.0, 'stats_7d_total_fee': 0.0, 'stats_7d_winrate': 0.0, 'stats_7d_tokens_traded': 0,
    }
    social2 = {'has_pf_profile': False, 'has_twitter': False, 'has_telegram': False, 'is_exchange_wallet': True, 'username': 'cex_wallet'}
    holdings2 = []

    # Define raw data and get their indices
    tokenA_data = {
        'address_emb_idx': pooler.get_idx('tknA'),
        'name_emb_idx': pooler.get_idx('Token A'),
        'symbol_emb_idx': pooler.get_idx('TKA'),
        'image_emb_idx': pooler.get_idx(Image.new('RGB', (256, 256), color='blue')),
        'protocol': 1
    }
    # Add wallet usernames to the pool
    wallet1_user_idx = pooler.get_idx(social1['username'])
    wallet2_user_idx = pooler.get_idx(social2['username'])
    social1['username_emb_idx'] = wallet1_user_idx
    social2['username_emb_idx'] = wallet2_user_idx
    # Add a third wallet for social tests
    social3 = {'has_pf_profile': False, 'has_twitter': True, 'has_telegram': True, 'is_exchange_wallet': False, 'username': 'social_butterfly'}
    wallet3_user_idx = pooler.get_idx(social3['username'])
    social3['username_emb_idx'] = wallet3_user_idx

    # Create the final pre-computed data structures
    # NOTE: tokens B-D reuse token A's mock values ('tknA', 'Token A', 'TKA')
    tokenB_data = {
        'address_emb_idx': pooler.get_idx('tknA'),
        'name_emb_idx': pooler.get_idx('Token A'),
        'symbol_emb_idx': pooler.get_idx('TKA'),
        'image_emb_idx': pooler.get_idx(Image.new('RGB', (256, 256), color='blue')),
        'protocol': 1
    }

    tokenC_data = {
        'address_emb_idx': pooler.get_idx('tknA'),
        'name_emb_idx': pooler.get_idx('Token A'),
        'symbol_emb_idx': pooler.get_idx('TKA'),
        'image_emb_idx': pooler.get_idx(Image.new('RGB', (256, 256), color='blue')),
        'protocol': 1
    }

    tokenD_data = {
        'address_emb_idx': pooler.get_idx('tknA'),
        'name_emb_idx': pooler.get_idx('Token A'),
        'symbol_emb_idx': pooler.get_idx('TKA'),
        'image_emb_idx': pooler.get_idx(Image.new('RGB', (256, 256), color='blue')),
        'protocol': 1
    }

    item = {
        'event_sequence': [
            {'event_type': 'XPost',
             'timestamp': 1729711350,
             'relative_ts': -25,
             'wallet_address': 'addrW1',  # Author
             'text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
             'media_emb_idx': pooler.get_idx(Image.new('RGB', (100, 100), color='cyan'))
             },
            {'event_type': 'XReply',
             'timestamp': 1729711360,
             'relative_ts': -35,
             'wallet_address': 'addrW2',  # Replier
             'text_emb_idx': pooler.get_idx('This is a reply to the main tweet'),
             'media_emb_idx': pooler.get_idx(None),  # No media in the reply
             'main_tweet_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA')
             },
            {'event_type': 'XRetweet',
             'timestamp': 1729711370,
             'relative_ts': -40,
             'wallet_address': 'addrW3',  # The retweeter
             'original_author_wallet_address': 'addrW1',  # The original author
             'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
             'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100, 100), color='cyan'))
             },
            # A pre-launch event with negative relative_ts
            {'event_type': 'Transfer',
             'timestamp': 1729711180,
             'relative_ts': -10,  # Negative relative_ts indicates pre-launch
             'wallet_address': 'addrW2',
             'destination_wallet_address': 'addrW1',
             'token_address': 'tknA',
             'token_amount': 1000.0, 'transfer_pct_of_total_supply': 0.0, 'transfer_pct_of_holding': 0.0, 'priority_fee': 0.0
             },
            {'event_type': 'Mint', 'timestamp': 1729711190, 'relative_ts': 0, 'wallet_address': 'addrW1', 'token_address': 'tknA'},
            {'event_type': 'Chart_Segment', 'timestamp': 1729711200, 'relative_ts': 60, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'},  # High-def (segment 0) by default
            {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 120, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'},  # Can be marked as blurry
            {'event_type': 'Transfer',
             'timestamp': 1729711210,
             'relative_ts': 20,
             'wallet_address': 'addrW1',  # Source
             'destination_wallet_address': 'addrW2',  # Destination
             'token_address': 'tknA',  # Token for context? (optional, depends on design)
             'token_amount': 500.0,
             'transfer_pct_of_total_supply': 0.005,
             'transfer_pct_of_holding': 0.1,
             'priority_fee': 0.0001
             },
            {'event_type': 'Trade',
             'timestamp': 1729711220,
             'relative_ts': 30,
             'wallet_address': 'addrW1',
             'token_address': 'tknA',
             'trade_direction': 0,
             'sol_amount': 0.5,
             'dex_platform_id': vocab.DEX_TO_ID['Axiom'],  # pass the integer ID directly
             'priority_fee': 0.0002,
             'mev_protection': False,
             'token_amount_pct_of_holding': 0.05, 'quote_amount_pct_of_holding': 0.02,
             'slippage': 0.01, 'price_impact': 0.005, 'success': True, 'is_bundle': False, 'total_usd': 75.0
             },
            {'event_type': 'Deployer_Trade',  # a trade variant
             'timestamp': 1729711230,
             'relative_ts': 40,
             'wallet_address': 'addrW1',  # The creator wallet
             'token_address': 'tknA',
             'trade_direction': 1, 'sol_amount': 0.2,
             'dex_platform_id': vocab.DEX_TO_ID['Trojan'],
             'priority_fee': 0.0005,
             'mev_protection': True,
             'token_amount_pct_of_holding': 0.1, 'quote_amount_pct_of_holding': 0.0,
             'slippage': 0.02, 'price_impact': 0.01, 'success': True, 'is_bundle': False, 'total_usd': 30.0
             },
            {'event_type': 'SmartWallet_Trade',
             'timestamp': 1729711240,
             'relative_ts': 50,
             'wallet_address': 'addrW1',  # A known smart wallet
             'token_address': 'tknA',
             'trade_direction': 0, 'sol_amount': 1.5,
             'dex_platform_id': vocab.DEX_TO_ID['Axiom'],
             'priority_fee': 0.001,
             'mev_protection': True,
             'token_amount_pct_of_holding': 0.2, 'quote_amount_pct_of_holding': 0.1,
             'slippage': 0.01, 'price_impact': 0.008, 'success': True, 'is_bundle': False, 'total_usd': 225.0
             },
            {'event_type': 'LargeTrade',
             'timestamp': 1729711250,
             'relative_ts': 60,
             'wallet_address': 'addrW2',  # Some other wallet
             'token_address': 'tknA',
             'trade_direction': 0, 'sol_amount': 10.0,
             'dex_platform_id': vocab.DEX_TO_ID['OXK'],
             'priority_fee': 0.002,
             'mev_protection': False,
             'token_amount_pct_of_holding': 0.8, 'quote_amount_pct_of_holding': 0.5,
             'slippage': 0.03, 'price_impact': 0.05, 'success': True, 'is_bundle': False, 'total_usd': 1500.0
             },
            {'event_type': 'Chart_Segment', 'timestamp': 1729711260, 'relative_ts': 70, 'opens': [1.0]*OHLC_SEQ_LEN, 'closes': [1.1]*OHLC_SEQ_LEN, 'i': '1s'},
            {'event_type': 'PoolCreated',
             'timestamp': 1729711270,
             'relative_ts': 80,
             'wallet_address': 'addrW1',
             'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM'],
             'quote_token_address': 'tknB',
             'base_amount': 1000000.0,
             'quote_amount': 10.0
             },
            {'event_type': 'LiquidityChange',
             'timestamp': 1729711280,
             'relative_ts': 90,
             'wallet_address': 'addrW2',
             'quote_token_address': 'tknB',
             'change_type_id': 0,  # 0 for 'add'
             'quote_amount': 2.0
             },
            {'event_type': 'FeeCollected',
             'timestamp': 1729711290,
             'relative_ts': 100,
             'wallet_address': 'addrW1',  # The recipient (e.g., dev wallet)
             'sol_amount': 0.1
             },
            {'event_type': 'TokenBurn',
             'timestamp': 1729711300,
             'relative_ts': 110,
             'wallet_address': 'addrW2',  # The burner wallet
             'amount_pct_of_total_supply': 0.01,  # 1% of supply
             'amount_tokens_burned': 10000000.0
             },
            {'event_type': 'SupplyLock',
             'timestamp': 1729711310,
             'relative_ts': 120,
             'wallet_address': 'addrW1',  # The locker wallet
             'amount_pct_of_total_supply': 0.10,  # 10% of supply
             'lock_duration': 2592000  # 30 days in seconds
             },
            {'event_type': 'HolderSnapshot',
             'timestamp': 1729711320,
             'relative_ts': 130,
             # Raw holder data; in a real system this would be an index into
             # the pre-computed <HolderDistributionEmbedding>
             'holders': [
                 {'wallet': 'addrW1', 'holding_pct': 0.15},
                 {'wallet': 'addrW2', 'holding_pct': 0.05},
                 # Add more mock holders if needed
             ]
             },
            {'event_type': 'OnChain_Snapshot',
             'timestamp': 1729711320,
             'relative_ts': 130,
             'total_holders': 500,
             'smart_traders': 25,
             'kols': 3,
             'holder_growth_rate': 0.15,
             'top_10_holder_pct': 0.22,
             'sniper_holding_pct': 0.05,
             'rat_wallets_holding_pct': 0.02,
             'bundle_holding_pct': 0.01,
             'current_market_cap': 150000.0,
             'volume': 50000.0,
             'buy_count': 120,
             'sell_count': 80,
             'total_txns': 200,
             'global_fees_paid': 1.5
             },
            {'event_type': 'TrendingToken',
             'timestamp': 1729711330,
             'relative_ts': 140,
             'token_address': 'tknC',  # The token that is trending
             'list_source_id': vocab.TRENDING_LIST_SOURCE_TO_ID['Phantom'],
             'timeframe_id': vocab.TRENDING_LIST_TIMEFRAME_TO_ID['1h'],
             'rank': 3
             },
            {'event_type': 'BoostedToken',
             'timestamp': 1729711340,
             'relative_ts': 150,
             'token_address': 'tknD',  # The token that is boosted
             'total_boost_amount': 5000.0,
             'rank': 1
             },
            {'event_type': 'XQuoteTweet',
             'timestamp': 1729711380,
             'relative_ts': 190,
             'wallet_address': 'addrW3',  # The quoter
             'quoter_text_emb_idx': pooler.get_idx('Wow, look at this! $TKA'),
             'original_author_wallet_address': 'addrW1',  # The original author
             'original_post_text_emb_idx': pooler.get_idx('This is the main tweet about $TKA'),
             'original_post_media_emb_idx': pooler.get_idx(Image.new('RGB', (100, 100), color='cyan'))
             },
            # Special context tokens
            {'event_type': 'MIDDLE', 'timestamp': 1729711500, 'relative_ts': 195},
            {'event_type': 'PumpReply',
             'timestamp': 1729711390,
             'relative_ts': 200,
             'wallet_address': 'addrW2',  # The user who replied
             'reply_text_emb_idx': pooler.get_idx('to the moon!')
             },
            {'event_type': 'DexBoost_Paid',
             'timestamp': 1729711400,
             'relative_ts': 210,
             'amount': 5.0,  # e.g., 5 Boost
             'total_amount_on_token': 25.0  # 25 Boost Points
             },
            {'event_type': 'DexProfile_Updated',
             'timestamp': 1729711410,
             'relative_ts': 220,
             'has_changed_website_flag': True,
             'has_changed_twitter_flag': False,
             'has_changed_telegram_flag': True,
             'has_changed_description_flag': True,
             # Pre-computed text embeddings
             'website_emb_idx': pooler.get_idx('new-token-website.com'),
             'twitter_link_emb_idx': pooler.get_idx('old_handle'),  # No change, so old link
             'telegram_link_emb_idx': pooler.get_idx('new_tg_group'),
             'description_emb_idx': pooler.get_idx('This is the new and improved token description.')
             },
            {'event_type': 'AlphaGroup_Call',
             'timestamp': 1729711420,
             'relative_ts': 230,
             'group_id': vocab.ALPHA_GROUPS_TO_ID['Potion']
             },
            {'event_type': 'Call_Channel',  # matches the TrackerEncoder schema above
             'timestamp': 1729711430,
             'relative_ts': 240,
             'channel_id': vocab.CALL_CHANNELS_TO_ID['MarcosCalls']
             },
            {'event_type': 'RECENT', 'timestamp': 1729711510, 'relative_ts': 245},
            {'event_type': 'CexListing',
             'timestamp': 1729711440,
             'relative_ts': 250,
             'exchange_id': vocab.EXCHANGES_TO_ID['mexc']
             },
            {'event_type': 'TikTok_Trending_Hashtag',
             'timestamp': 1729711450,
             'relative_ts': 260,
             'hashtag_name_emb_idx': pooler.get_idx('CryptoTok'),
             'rank': 5
             },
            {'event_type': 'XTrending_Hashtag',
             'timestamp': 1729711460,
             'relative_ts': 270,
             'hashtag_name_emb_idx': pooler.get_idx('SolanaMemes'),
             'rank': 2
             },
            {'event_type': 'ChainSnapshot',
             'timestamp': 1729711470,
             'relative_ts': 280,
             'native_token_price_usd': 150.75,
             'gas_fee': 0.00015  # Example gas fee
             },
            {'event_type': 'Lighthouse_Snapshot',
             'timestamp': 1729711480,
             'relative_ts': 290,
             'protocol_id': vocab.PROTOCOL_TO_ID['Pump V1'],
             'timeframe_id': vocab.LIGHTHOUSE_TIMEFRAME_TO_ID['1h'],
             'total_volume': 1.2e6,
             'total_transactions': 5000,
             'total_traders': 1200,
             'total_tokens_created': 85,
             'total_migrations': 70
             },
            {'event_type': 'Migrated',
             'timestamp': 1729711490,
             'relative_ts': 300,
             'protocol_id': vocab.PROTOCOL_TO_ID['Raydium CPMM']
             },
        ],
        'wallets': {
            'addrW1': {'profile': profile1, 'socials': social1, 'holdings': holdings1},
            'addrW2': {'profile': profile2, 'socials': social2, 'holdings': holdings2},
719
+ # --- NEW: Add wallet 3 data ---
720
+ 'addrW3': {
721
+ 'profile': {**profile2, 'wallet_address': 'addrW3'}, # Reuse profile2 but change address
722
+ 'socials': social3,
723
+ 'holdings': []
724
+ }
725
+ },
726
+ 'tokens': {
727
+ 'tknA': tokenA_data, # Main token
728
+ 'tknB': tokenB_data, # Quote token
729
+ 'tknC': tokenC_data, # Trending token
730
+ 'tknD': tokenD_data # Boosted token
731
+ },
732
+ # --- NEW: The pre-computed embedding pool is generated after collecting all items
733
+ 'embedding_pooler': pooler, # Pass the pooler to generate the tensor later
734
+
735
+ # --- NEW: Expanded graph_links to test all encoders ---
736
+ # --- FIXED: Removed useless logging fields as per user request ---
737
+ 'graph_links': {
738
+ 'TransferLink': {'links': [{'timestamp': 1729711205}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
739
+ 'BundleTradeLink': {'links': [{'timestamp': 1729711215}], 'edges': [('addrW1', 'addrW2')]}, # Keep timestamp
740
+ 'CopiedTradeLink': {'links': [
741
+ {'time_gap_on_buy_sec': 10, 'time_gap_on_sell_sec': 120, 'leader_pnl': 5.0, 'follower_pnl': 4.0, 'follower_buy_total': 100, 'follower_sell_total': 120}
742
+ ], 'edges': [('addrW1', 'addrW2')]},
743
+ 'CoordinatedActivityLink': {'links': [
744
+ {'time_gap_on_first_sec': 5, 'time_gap_on_second_sec': 8}
745
+ ], 'edges': [('addrW1', 'addrW2')]},
746
+ 'MintedLink': {'links': [
747
+ {'timestamp': 1729711200, 'buy_amount': 1e9}
748
+ ], 'edges': [('addrW1', 'tknA')]},
749
+ 'SnipedLink': {'links': [
750
+ {'rank': 1, 'sniped_amount': 5e8}
751
+ ], 'edges': [('addrW1', 'tknA')]},
752
+ 'LockedSupplyLink': {'links': [
753
+ {'amount': 1e10} # Only amount is needed
754
+ ], 'edges': [('addrW1', 'tknA')]},
755
+ 'BurnedLink': {'links': [
756
+ {'timestamp': 1729711300} # Only timestamp is needed
757
+ ], 'edges': [('addrW2', 'tknA')]},
758
+ 'ProvidedLiquidityLink': {'links': [
759
+ {'timestamp': 1729711250} # Only timestamp is needed
760
+ ], 'edges': [('addrW1', 'tknA')]},
761
+ 'WhaleOfLink': {'links': [
762
+ {} # Just the existence of the link is the feature
763
+ ], 'edges': [('addrW1', 'tknA')]},
764
+ 'TopTraderOfLink': {'links': [
765
+ {'pnl_at_creation': 50000.0} # Only PnL is needed
766
+ ], 'edges': [('addrW2', 'tknA')]}
767
+ },
768
+
769
+ # --- FIXED: Removed chart_segments dictionary ---
770
+ 'labels': torch.randn(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0),
771
+ 'labels_mask': torch.ones(self.num_outputs) if self.num_outputs > 0 else torch.zeros(0)
772
+ }
773
+
774
+ print("Mock raw batch created.")
775
+
776
+ return item
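The contract shared between this mock item and the collator in the next file is the `EmbeddingPooler` interface: raw texts and images are registered via `pooler.get_idx(...)`, and the collator later re-pools them batch-wide, assuming `get_idx` hands out stable 1-based indices (0 stays reserved for padding) and `get_all_items` yields `{'idx', 'item'}` dicts. A minimal sketch of that assumed interface follows — `EmbeddingPoolerSketch` is a hypothetical stand-in for illustration, not the real `data.data_loader.EmbeddingPooler`:

```python
from typing import Any, Dict, List

class EmbeddingPoolerSketch:
    """Hypothetical stand-in for data.data_loader.EmbeddingPooler.

    Deduplicates embeddable items and hands out 1-based indices,
    so index 0 can stay reserved as the padding slot.
    """

    def __init__(self) -> None:
        self._items: List[Any] = []       # insertion-ordered unique items
        self._index: Dict[Any, int] = {}  # item -> 1-based index

    def get_idx(self, item: Any) -> int:
        # Return the existing index or register the item under a new one.
        # (The real pooler would also need a content key for PIL images,
        # which are not hashable by pixel content.)
        if item not in self._index:
            self._items.append(item)
            self._index[item] = len(self._items)  # 1-based
        return self._index[item]

    def get_all_items(self) -> List[Dict[str, Any]]:
        # Same shape the collator iterates: [{'idx': ..., 'item': ...}]
        return [{'idx': i + 1, 'item': it} for i, it in enumerate(self._items)]
```

This 1-based convention is why the collator below subtracts 1 when building the 0-based `batch_embedding_pool` tensor.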
data/data_collator.py ADDED
@@ -0,0 +1,708 @@
1
+ # memecoin_collator.py (CORRECTED ORDER OF OPERATIONS)
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from torch.nn.utils.rnn import pad_sequence
6
+ from typing import List, Dict, Any, Tuple, Optional, Union
7
+ from collections import defaultdict
8
+ from PIL import Image
9
+ from models.multi_modal_processor import MultiModalEncoder
10
+
11
+ # Encoders are NO LONGER imported here
12
+ import models.vocabulary as vocab # For IDs, config sizes
13
+ from data.data_loader import EmbeddingPooler # Import for type hinting and instantiation
14
+
15
+ NATIVE_MINT = "So11111111111111111111111111111111111111112"
16
+ QUOTE_MINTS = {
17
+ NATIVE_MINT, # SOL
18
+ "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", # USDC
19
+ "Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB", # USDT
20
+ "USD1ttGY1N17NEEHLmELoaybftRBUSErhqYiQzvEmuB", # USD1
21
+ }
22
+
23
+ class MemecoinCollator:
24
+ """
25
+ Callable class for PyTorch DataLoader's collate_fn.
26
+ ... (rest of docstring) ...
27
+ """
+ def __init__(self,
+ event_type_to_id: Dict[str, int],
+ device: torch.device,
+ multi_modal_encoder: MultiModalEncoder,
+ dtype: torch.dtype,
+ ohlc_seq_len: int = 300,
+ max_seq_len: Optional[int] = None
+ ):
+ self.event_type_to_id = event_type_to_id
+ self.pad_token_id = event_type_to_id.get('__PAD__', 0)
+ self.multi_modal_encoder = multi_modal_encoder
+ self.entity_pad_idx = 0
+
+ self.device = device
+ self.dtype = dtype
+ self.ohlc_seq_len = ohlc_seq_len
+ self.max_seq_len = max_seq_len
+
+ def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
+ """Collate raw token or wallet feature dicts into encoder inputs."""
+ collated = defaultdict(list)
+ if not entities:
+ # --- FIXED: Return a default empty structure for BOTH tokens and wallets ---
+ if entity_type == "token":
+ return {
+ 'name_embed_indices': torch.tensor([], device=device, dtype=torch.long),
+ 'symbol_embed_indices': torch.tensor([], device=device, dtype=torch.long),
+ 'image_embed_indices': torch.tensor([], device=device, dtype=torch.long),
+ 'protocol_ids': torch.tensor([], device=device, dtype=torch.long),
+ 'is_vanity_flags': torch.tensor([], device=device, dtype=torch.bool),
+ '_addresses_for_lookup': []
+ }
+ elif entity_type == "wallet":
+ return {
+ 'username_embed_indices': torch.tensor([], device=device, dtype=torch.long),
+ 'profile_rows': [], 'social_rows': [], 'holdings_batch': []
+ }
+ return {} # Should not happen
+
+ # NEW: We now gather indices to pre-computed embeddings
+ if entity_type == "token":
+ # This indicates a Token entity
+ # Helper key for WalletEncoder to find token vibes
+ collated['_addresses_for_lookup'] = [e.get('address', '') for e in entities]
+ collated['name_embed_indices'] = torch.tensor([e.get('name_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
+ collated['symbol_embed_indices'] = torch.tensor([e.get('symbol_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
+ collated['image_embed_indices'] = torch.tensor([e.get('image_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
+ collated['protocol_ids'] = torch.tensor([e.get('protocol', 0) for e in entities], device=device, dtype=torch.long)
+ collated['is_vanity_flags'] = torch.tensor([e.get('is_vanity', False) for e in entities], device=device, dtype=torch.bool)
+ elif entity_type == "wallet":
+ # NEW: Gather username indices for WalletEncoder
+ collated['username_embed_indices'] = torch.tensor([e.get('socials', {}).get('username_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
+ collated['profile_rows'] = [e.get('profile', {}) for e in entities]
+ collated['social_rows'] = [e.get('socials', {}) for e in entities]
+ collated['holdings_batch'] = [e.get('holdings', []) for e in entities]
+ return dict(collated)
+
+ def _collate_ohlc_inputs(self, chart_events: List[Dict]) -> Dict[str, torch.Tensor]:
+ """Pad and stack open/close price segments into OHLC tensors with interval ids."""
+ if not chart_events:
+ return {
+ 'price_tensor': torch.empty(0, 2, self.ohlc_seq_len, device=self.device, dtype=self.dtype),
+ 'interval_ids': torch.empty(0, device=self.device, dtype=torch.long)
+ }
+ ohlc_tensors = []
+ interval_ids_list = []
+ seq_len = self.ohlc_seq_len
+ unknown_id = vocab.INTERVAL_TO_ID.get("Unknown", 0)
+ for segment_data in chart_events:
+ opens = segment_data.get('opens', [])
+ closes = segment_data.get('closes', [])
+ interval_str = segment_data.get('i', "Unknown")
+ pad_open = opens[-1] if opens else 0
+ pad_close = closes[-1] if closes else 0
+ o = torch.tensor(opens[:seq_len] + [pad_open]*(seq_len-len(opens)), dtype=self.dtype)
+ c = torch.tensor(closes[:seq_len] + [pad_close]*(seq_len-len(closes)), dtype=self.dtype)
+ ohlc_tensors.append(torch.stack([o, c]))
+ interval_id = vocab.INTERVAL_TO_ID.get(interval_str, unknown_id)
+ interval_ids_list.append(interval_id)
+ return {
+ 'price_tensor': torch.stack(ohlc_tensors).to(self.device),
+ 'interval_ids': torch.tensor(interval_ids_list, device=self.device, dtype=torch.long)
+ }
+
+ def _collate_graph_links(self,
+ batch_items: List[Dict],
+ wallet_addr_to_batch_idx: Dict[str, int],
+ token_addr_to_batch_idx: Dict[str, int]) -> Dict[str, Any]:
+ """Aggregate per-item graph links into batch-wide edge_index tensors."""
+ aggregated_links = defaultdict(lambda: {'edge_index_list': [], 'links_list': []})
+ for item in batch_items:
+ item_wallets = item.get('wallets', {})
+ item_tokens = item.get('tokens', {})
+ item_wallet_addr_to_global_idx = {addr: wallet_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_wallets.keys()}
+ item_token_addr_to_global_idx = {addr: token_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_tokens.keys()}
+ for link_name, data in item.get('graph_links', {}).items():
+ aggregated_links[link_name]['links_list'].extend(data.get('links', []))
+ triplet = vocab.LINK_NAME_TO_TRIPLET.get(link_name)
+ if not triplet: continue
+ src_type, _, dst_type = triplet
+ edges = data.get('edges')
+ if not edges: continue
+ src_map = item_wallet_addr_to_global_idx if src_type == 'wallet' else item_token_addr_to_global_idx
+ dst_map = item_wallet_addr_to_global_idx if dst_type == 'wallet' else item_token_addr_to_global_idx
+ remapped_edge_list = []
+ for src_addr, dst_addr in edges:
+ src_idx_global = src_map.get(src_addr, self.entity_pad_idx)
+ dst_idx_global = dst_map.get(dst_addr, self.entity_pad_idx)
+ if src_idx_global != self.entity_pad_idx and dst_idx_global != self.entity_pad_idx:
+ remapped_edge_list.append([src_idx_global, dst_idx_global])
+ if remapped_edge_list:
+ remapped_edge_tensor = torch.tensor(remapped_edge_list, device=self.device, dtype=torch.long).t()
+ aggregated_links[link_name]['edge_index_list'].append(remapped_edge_tensor)
+ if link_name == "TransferLink":
+ link_props = data.get('links', [])
+ derived_edges = []
+ derived_props = []
+ for (src_addr, dst_addr), props in zip(edges, link_props):
+ mint_addr = props.get('mint')
+ if not mint_addr or mint_addr in QUOTE_MINTS:
+ continue
+ token_idx_global = item_token_addr_to_global_idx.get(mint_addr, self.entity_pad_idx)
+ if token_idx_global == self.entity_pad_idx:
+ continue
+ for wallet_addr in (src_addr, dst_addr):
+ wallet_idx_global = item_wallet_addr_to_global_idx.get(wallet_addr, self.entity_pad_idx)
+ if wallet_idx_global == self.entity_pad_idx:
+ continue
+ derived_edges.append([wallet_idx_global, token_idx_global])
+ derived_props.append(props)
+ if derived_edges:
+ derived_tensor = torch.tensor(derived_edges, device=self.device, dtype=torch.long).t()
+ aggregated_links["TransferLinkToken"]['edge_index_list'].append(derived_tensor)
+ aggregated_links["TransferLinkToken"]['links_list'].extend(derived_props)
+ final_links_dict = {}
+ for link_name, data in aggregated_links.items():
+ if data['edge_index_list']:
+ final_links_dict[link_name] = {
+ 'links': data['links_list'],
+ 'edge_index': torch.cat(data['edge_index_list'], dim=1)
+ }
+ return final_links_dict
+
+ def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
+ """
+ Processes a batch of raw data items into tensors for the model.
+ """
+ # --- NEW ARCHITECTURE ---
+ # 1. Aggregate all unique embeddable items from the entire batch.
+ # 2. Create a single embedding pool tensor for the whole batch.
+ # 3. Create a mapping from original (per-item) indices to the new batch-wide indices.
+ # 4. Remap all `_emb_idx` fields in the batch data using this new mapping.
+
+ batch_size = len(batch)
+ if batch_size == 0:
+ return {}
+
+ # --- 1. Aggregate all unique items and create index mappings ---
+ batch_wide_pooler = EmbeddingPooler()
+ # Map to translate from an item's original pooler to the new batch-wide indices
+ # Format: { batch_item_index: { original_idx: new_batch_idx } }
+ idx_remap = defaultdict(dict)
+
+ for i, item in enumerate(batch):
+ pooler = item.get('embedding_pooler')
+ if not pooler: continue
+
+ for pool_item_data in pooler.get_all_items():
+ original_idx = pool_item_data['idx']
+ raw_item = pool_item_data['item']
+ # get_idx will either return an existing index or create a new one
+ # --- FIX: Convert 1-based pooler index to 0-based tensor index ---
+ new_batch_idx_1_based = batch_wide_pooler.get_idx(raw_item)
+ new_batch_idx_0_based = new_batch_idx_1_based - 1
+ idx_remap[i][original_idx] = new_batch_idx_0_based
+
+ # --- 2. Create the single, batch-wide embedding pool tensor ---
+ all_items_sorted = batch_wide_pooler.get_all_items()
+ texts_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], str)]
+ images_to_encode = [d['item'] for d in all_items_sorted if isinstance(d['item'], Image.Image)]
+
+ text_embeds = self.multi_modal_encoder(texts_to_encode) if texts_to_encode else torch.empty(0)
+ image_embeds = self.multi_modal_encoder(images_to_encode) if images_to_encode else torch.empty(0)
+
+ # Create the final lookup tensor and fill it based on original item type
+ batch_embedding_pool = torch.zeros(len(all_items_sorted), self.multi_modal_encoder.embedding_dim, device=self.device, dtype=self.dtype)
+ text_cursor, image_cursor = 0, 0
+ for i, item_data in enumerate(all_items_sorted):
+ if isinstance(item_data['item'], str):
+ if text_embeds.numel() > 0:
+ batch_embedding_pool[i] = text_embeds[text_cursor]
+ text_cursor += 1
+ elif isinstance(item_data['item'], Image.Image):
+ if image_embeds.numel() > 0:
+ batch_embedding_pool[i] = image_embeds[image_cursor]
+ image_cursor += 1
+
+ # --- 3. Remap all indices in the batch data ---
+ for i, item in enumerate(batch):
+ remap_dict = idx_remap.get(i, {})
+ if not remap_dict: continue
+
+ # Remap tokens
+ for token_data in item.get('tokens', {}).values():
+ for key in ['name_emb_idx', 'symbol_emb_idx', 'image_emb_idx']:
+ if token_data.get(key, 0) > 0: # Check if it has a valid 1-based index
+ token_data[key] = remap_dict.get(token_data[key], -1) # Remap to 0-based, default to -1 if not found
+ # Remap wallets
+ for wallet_data in item.get('wallets', {}).values():
+ socials = wallet_data.get('socials', {})
+ if socials.get('username_emb_idx', 0) > 0:
+ socials['username_emb_idx'] = remap_dict.get(socials['username_emb_idx'], -1)
+ # Remap events
+ for event in item.get('event_sequence', []):
+ for key in event:
+ if key.endswith('_emb_idx') and event.get(key, 0) > 0:
+ event[key] = remap_dict.get(event[key], 0)
+
+ # --- 4. Standard Collation (Now that indices are correct) ---
+ unique_wallets_data = {}
+ unique_tokens_data = {}
+ all_event_sequences = []
+ max_len = 0
+
+ for item in batch:
+ seq = item.get('event_sequence', [])
+ if self.max_seq_len is not None and len(seq) > self.max_seq_len:
+ seq = seq[:self.max_seq_len]
+ all_event_sequences.append(seq)
+ max_len = max(max_len, len(seq))
+ unique_wallets_data.update(item.get('wallets', {}))
+ unique_tokens_data.update(item.get('tokens', {}))
+
+ # Create mappings needed for indexing
+ wallet_list_data = list(unique_wallets_data.values())
+ token_list_data = list(unique_tokens_data.values())
+ wallet_addr_to_batch_idx = {feat.get('profile', {}).get('wallet_address', f'__error_{i}'): i+1 for i, feat in enumerate(wallet_list_data)}
+ token_addr_to_batch_idx = {feat.get('address', f'__error_{i}'): i+1 for i, feat in enumerate(token_list_data)}
+
+ # Collate Static Raw Features (Tokens, Wallets, Graph)
+ token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
+ wallet_encoder_inputs = self._collate_features_for_encoder(wallet_list_data, ['profile'], self.device, "wallet")
+ graph_updater_links = self._collate_graph_links(batch, wallet_addr_to_batch_idx, token_addr_to_batch_idx)
+
+ # --- Logging ---
+ pool_contents = batch_wide_pooler.get_all_items()
+ print(f"\n[DataCollator: Final Embedding Pool] ({len(pool_contents)} items):")
+ if pool_contents:
+ for item_data in pool_contents:
+ sample_item = item_data['item']
+ sample_type = "Image" if isinstance(sample_item, Image.Image) else "Text"
+ content_preview = str(sample_item)
+ if sample_type == "Text" and len(content_preview) > 100:
+ content_preview = content_preview[:97] + "..."
+ print(f" - Item (Original Idx {item_data['idx']}): Type='{sample_type}', Content='{content_preview}'")
+
+ # --- 5. Prepare Sequence Tensors & Collect Dynamic Data (OHLC) ---
+ B = batch_size
+ L = max_len
+ PAD_IDX_SEQ = self.pad_token_id
+ PAD_IDX_ENT = self.entity_pad_idx
+
+ # Initialize sequence tensors
+ event_type_ids = torch.full((B, L), PAD_IDX_SEQ, dtype=torch.long, device=self.device)
+ timestamps_float = torch.zeros((B, L), dtype=torch.float32, device=self.device)
+ # Store relative_ts in float32 for stability; model will scale/log/normalize
+ relative_ts = torch.zeros((B, L, 1), dtype=torch.float32, device=self.device)
+ attention_mask = torch.zeros((B, L), dtype=torch.long, device=self.device)
+ wallet_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ ohlc_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ quote_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device) # NEW
+
+ # --- NEW: Tensors for Transfer/LargeTransfer ---
+ dest_wallet_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ # --- NEW: Separate tensor for social media original authors ---
+ original_author_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ # 4 numerical features for transfers
+ transfer_numerical_features = torch.zeros((B, L, 4), dtype=self.dtype, device=self.device)
+
+ # --- NEW: Tensors for Trade ---
+ # --- FIXED: Size reduced from 10 to 8 ---
+ trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
+ deployer_trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
+ smart_wallet_trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
+ # --- NEW: Dedicated tensor for categorical dex_platform_id ---
+ trade_dex_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ # --- NEW: Dedicated tensor for categorical trade_direction ---
+ trade_direction_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ # --- NEW: Dedicated tensor for categorical mev_protection ---
+ trade_mev_protection_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ # --- NEW: Dedicated tensor for categorical is_bundle ---
+ trade_is_bundle_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for PoolCreated ---
+ # --- UPDATED: Capture raw base/quote deposit amounts only ---
+ pool_created_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device)
+ # --- NEW: Dedicated tensor for categorical protocol_id ---
+ pool_created_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for LiquidityChange ---
+ # --- UPDATED: Keep only the raw quote amount deposit/withdraw ---
+ liquidity_change_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device)
+ # --- NEW: Dedicated tensor for categorical change_type_id ---
+ liquidity_change_type_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for FeeCollected ---
+ fee_collected_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # sol_amount only
+ # --- NEW: Tensors for TokenBurn ---
+ token_burn_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount_pct, amount_tokens
+
+ # --- NEW: Tensors for SupplyLock ---
+ supply_lock_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount_pct, lock_duration
+
+ # --- NEW: Tensors for OnChain_Snapshot ---
+ onchain_snapshot_numerical_features = torch.zeros((B, L, 14), dtype=self.dtype, device=self.device)
+
+ # --- NEW: Tensors for TrendingToken ---
+ trending_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ # --- FIXED: Size reduced from 3 to 1 after removing IDs ---
+ trending_token_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # rank
+ trending_token_source_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ trending_token_timeframe_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for BoostedToken ---
+ boosted_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ boosted_token_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # total_boost_amount, rank
+
+ # --- NEW: Tensors for DexBoost_Paid ---
+ dexboost_paid_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount, total_amount_on_token
+
+ # --- NEW: Tensors for DexProfile_Updated ---
+ dexprofile_updated_flags = torch.zeros((B, L, 4), dtype=torch.float32, device=self.device) # Using float for easier projection
+
+ # --- NEW: Tensors for Tracker Events ---
+ alpha_group_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ channel_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ exchange_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for GlobalTrending Events ---
+ global_trending_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # rank
+
+ # --- NEW: Tensors for ChainSnapshot ---
+ chainsnapshot_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # native_token_price_usd, gas_fee
+
+ # --- NEW: Tensors for Lighthouse_Snapshot ---
+ # --- FIXED: Size reduced from 7 to 5 after removing IDs ---
+ lighthousesnapshot_numerical_features = torch.zeros((B, L, 5), dtype=self.dtype, device=self.device)
+ lighthousesnapshot_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+ lighthousesnapshot_timeframe_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for Migrated event ---
+ migrated_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
+
+ # --- NEW: Tensors for HolderSnapshot ---
+ # This will store the raw holder data for the Oracle to process
+ holder_snapshot_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ holder_snapshot_raw_data_list = [] # List of lists of dicts
+
+ # --- RENAMED: Generic tensors for any event with text/image features ---
+ textual_event_data_list = [] # List of dicts with text/media indices
+ textual_event_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ # --- NEW: Pointers for pre-encoded images ---
+ image_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+ original_post_image_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
+
+ # --- CORRECTED: Initialize chart event collection here ---
+ batch_chart_events = []
+ chart_event_counter = 0
+
+ # Loop through sequences to populate tensors and collect chart events
+ for i, seq in enumerate(all_event_sequences):
+ seq_len = len(seq)
+ if seq_len == 0: continue
+ attention_mask[i, :seq_len] = 1
+
+ for j, event in enumerate(seq):
+ # Populate basic sequence info
+ event_type = event.get('event_type', '__PAD__')
+ type_id = self.event_type_to_id.get(event_type, PAD_IDX_SEQ)
+ event_type_ids[i, j] = type_id
+ timestamps_float[i, j] = event.get('timestamp', 0)
+ relative_ts[i, j, 0] = event.get('relative_ts', 0.0)
+
+ # Populate pointer indices
+ w_addr = event.get('wallet_address')
+ if w_addr:
+ wallet_indices[i, j] = wallet_addr_to_batch_idx.get(w_addr, PAD_IDX_ENT)
+ t_addr = event.get('token_address')
+ if t_addr:
+ token_indices[i, j] = token_addr_to_batch_idx.get(t_addr, PAD_IDX_ENT)
+
+ # If it's a chart event, collect it and record its index
+ if event_type == 'Chart_Segment':
+ batch_chart_events.append(event)
+ ohlc_indices[i, j] = chart_event_counter + 1 # Use 1-based index
+ chart_event_counter += 1
+
+ elif event_type in ['Transfer', 'LargeTransfer']: # ADDED LargeTransfer
+ # Get destination wallet index
+ dest_w_addr = event.get('destination_wallet_address') # Assuming this key exists
+ if dest_w_addr:
+ dest_wallet_indices[i, j] = wallet_addr_to_batch_idx.get(dest_w_addr, PAD_IDX_ENT)
+
+ # Get numerical features (use .get with default 0)
+ num_feats = [
+ event.get('token_amount', 0.0),
+ event.get('transfer_pct_of_total_supply', 0.0),
+ event.get('transfer_pct_of_holding', 0.0),
+ event.get('priority_fee', 0.0)
+ ]
+ transfer_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type in ['Trade', 'LargeTrade']:
+ # Get numerical and categorical features for the trade
+ trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
+ trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
+ trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
+ trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true
+
+ num_feats = [
+ event.get('sol_amount', 0.0),
+ event.get('priority_fee', 0.0),
+ event.get('token_amount_pct_of_holding', 0.0),
+ event.get('quote_amount_pct_of_holding', 0.0),
+ event.get('slippage', 0.0),
+ event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
+ 1.0 if event.get('success') else 0.0,
+ event.get('total_usd', 0.0)
+ ]
+ trade_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'Deployer_Trade':
+ # Use the dedicated tensor for deployer trades
+ trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
+ trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
+ trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
+ trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true
+ num_feats = [
+ event.get('sol_amount', 0.0),
+ event.get('priority_fee', 0.0),
+ event.get('token_amount_pct_of_holding', 0.0),
+ event.get('quote_amount_pct_of_holding', 0.0),
+ event.get('slippage', 0.0),
+ event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
+ 1.0 if event.get('success') else 0.0,
+ event.get('total_usd', 0.0)
+ ]
+ deployer_trade_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'SmartWallet_Trade':
+ # Use the dedicated tensor for smart wallet trades
+ trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
+ trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
+ trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
+ trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true
+ num_feats = [
+ event.get('sol_amount', 0.0),
+ event.get('priority_fee', 0.0),
+ event.get('token_amount_pct_of_holding', 0.0),
+ event.get('quote_amount_pct_of_holding', 0.0),
+ event.get('slippage', 0.0),
+ event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
+ 1.0 if event.get('success') else 0.0,
+ event.get('total_usd', 0.0)
+ ]
+ smart_wallet_trade_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'PoolCreated':
+ # Get the quote token index
+ quote_t_addr = event.get('quote_token_address')
+ if quote_t_addr:
+ quote_token_indices[i, j] = token_addr_to_batch_idx.get(quote_t_addr, PAD_IDX_ENT)
+
+ pool_created_protocol_ids[i, j] = event.get('protocol_id', 0)
+ # Get numerical features
+ num_feats = [
+ event.get('base_amount', 0.0),
+ event.get('quote_amount', 0.0)
+ ]
+ pool_created_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'LiquidityChange':
+ # Get the quote token index
+ quote_t_addr = event.get('quote_token_address')
+ if quote_t_addr:
+ quote_token_indices[i, j] = token_addr_to_batch_idx.get(quote_t_addr, PAD_IDX_ENT)
+
+ liquidity_change_type_ids[i, j] = event.get('change_type_id', 0)
+ # Get numerical features
+ num_feats = [event.get('quote_amount', 0.0)]
+ liquidity_change_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'FeeCollected':
+ # This event has the recipient wallet plus a single numerical feature (SOL amount).
+ num_feats = [
+ event.get('sol_amount', 0.0)
+ ]
+ fee_collected_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'TokenBurn':
+ # This event has a wallet (handled by wallet_indices) and two numerical features.
+ num_feats = [
+ event.get('amount_pct_of_total_supply', 0.0),
+ event.get('amount_tokens_burned', 0.0)
+ ]
+ token_burn_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
+
+ elif event_type == 'SupplyLock':
+ # This event has a wallet and two numerical features.
+ num_feats = [
+ event.get('amount_pct_of_total_supply', 0.0),
+ event.get('lock_duration', 0.0)
543
+ ]
544
+ supply_lock_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
545
+
546
+ elif event_type == 'OnChain_Snapshot':
547
+ # This event is a global snapshot with 14 numerical features.
548
+ num_feats = [
549
+ event.get('total_holders', 0.0),
550
+ event.get('smart_traders', 0.0),
551
+ event.get('kols', 0.0),
552
+ event.get('holder_growth_rate', 0.0),
553
+ event.get('top_10_holder_pct', 0.0),
554
+ event.get('sniper_holding_pct', 0.0),
555
+ event.get('rat_wallets_holding_pct', 0.0),
556
+ event.get('bundle_holding_pct', 0.0),
557
+ event.get('current_market_cap', 0.0),
558
+ event.get('volume', 0.0),
559
+ event.get('buy_count', 0.0),
560
+ event.get('sell_count', 0.0),
561
+ event.get('total_txns', 0.0),
562
+ event.get('global_fees_paid', 0.0)
563
+ ]
564
+ onchain_snapshot_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
565
+
566
+ elif event_type == 'TrendingToken':
567
+ # Get the trending token index
568
+ trending_t_addr = event.get('token_address')
569
+ if trending_t_addr:
570
+ trending_token_indices[i, j] = token_addr_to_batch_idx.get(trending_t_addr, PAD_IDX_ENT)
571
+
572
+ trending_token_source_ids[i, j] = event.get('list_source_id', 0)
573
+ trending_token_timeframe_ids[i, j] = event.get('timeframe_id', 0)
574
+ # --- FIXED: Invert rank so that 1 is the highest value ---
575
+ # Get numerical/categorical features
576
+ num_feats = [
577
+ 1.0 / (event.get('rank') or 1e9)  # 'or' treats a missing, None, or 0 rank as very large, so the inverted value is ~0 instead of raising ZeroDivisionError
578
+ ]
579
+ trending_token_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
580
+
581
+ elif event_type == 'BoostedToken':
582
+ # Get the boosted token index
583
+ boosted_t_addr = event.get('token_address')
584
+ if boosted_t_addr:
585
+ boosted_token_indices[i, j] = token_addr_to_batch_idx.get(boosted_t_addr, PAD_IDX_ENT)
586
+
587
+ # --- FIXED: Invert rank so that 1 is the highest value ---
588
+ # Get numerical features
589
+ num_feats = [
590
+ event.get('total_boost_amount', 0.0),
591
+ 1.0 / (event.get('rank') or 1e9)  # missing, None, or 0 rank maps to ~0, avoiding ZeroDivisionError
592
+ ]
593
+ boosted_token_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
594
+
595
+ elif event_type == 'HolderSnapshot':
596
+ # --- FIXED: Store raw holder data, not an index ---
597
+ raw_holders = event.get('holders', [])
598
+ holder_snapshot_raw_data_list.append(raw_holders)
599
+ holder_snapshot_indices[i, j] = len(holder_snapshot_raw_data_list) # 1-based index to the list
600
+
601
+ elif event_type == 'Lighthouse_Snapshot':
602
+ lighthousesnapshot_protocol_ids[i, j] = event.get('protocol_id', 0)
603
+ lighthousesnapshot_timeframe_ids[i, j] = event.get('timeframe_id', 0)
604
+ num_feats = [
605
+ event.get('total_volume', 0.0),
606
+ event.get('total_transactions', 0.0),
607
+ event.get('total_traders', 0.0),
608
+ event.get('total_tokens_created', 0.0),
609
+ event.get('total_migrations', 0.0)
610
+ ]
611
+ lighthousesnapshot_numerical_features[i, j, :] = torch.tensor(num_feats, dtype=self.dtype)
612
+
613
+
614
+ # --- UPDATED: Group all events that contain pre-computed text/image indices ---
615
+ elif event_type in ['XPost', 'XReply', 'XRetweet', 'XQuoteTweet', 'PumpReply', 'DexProfile_Updated', 'TikTok_Trending_Hashtag', 'XTrending_Hashtag']:
616
+ # Store raw event data to look up text/image indices later
617
+ # 1. Store raw text/media data
618
+ textual_event_data_list.append(event)
619
+ textual_event_indices[i, j] = len(textual_event_data_list) # 1-based index
620
+ # --- FIXED: Handle rank for trending hashtags ---
621
+ if event_type in ['TikTok_Trending_Hashtag', 'XTrending_Hashtag']:
622
+ global_trending_numerical_features[i, j, 0] = 1.0 / (event.get('rank') or 1e9)  # guard against rank 0 / missing
623
+
624
+ # 2. Populate wallet pointer tensors based on the event type
625
+ # The main 'wallet_address' is already handled above.
626
+ # Here we handle the *other* wallets involved.
627
+ if event_type == 'XRetweet' or event_type == 'XQuoteTweet':
628
+ orig_author_addr = event.get('original_author_wallet_address')
629
+ if orig_author_addr:
630
+ # --- FIXED: Use the dedicated tensor for original authors ---
631
+ original_author_indices[i, j] = wallet_addr_to_batch_idx.get(orig_author_addr, PAD_IDX_ENT)
632
+
633
+ # The pre-computed embedding indices are already in the event dict.
634
+ # No need to populate image_indices here anymore.
635
+ # For XReply, the main tweet is a text/media embedding, not a wallet.
636
+ # For XPost, there's only one wallet, already handled.
637
+
638
+ # --- 4. Collate Dynamic Features (OHLC) AFTER collecting them ---
639
+ ohlc_inputs_dict = self._collate_ohlc_inputs(batch_chart_events)
640
+
641
+ # --- 6. Prepare final output dictionary ---
642
+ collated_batch = {
643
+ # Sequence Tensors
644
+ 'event_type_ids': event_type_ids,
645
+ 'timestamps_float': timestamps_float,
646
+ 'relative_ts': relative_ts,
647
+ 'attention_mask': attention_mask,
648
+ # Pointer Tensors
649
+ 'wallet_indices': wallet_indices,
650
+ 'token_indices': token_indices,
651
+ 'quote_token_indices': quote_token_indices, # NEW
652
+ 'trending_token_indices': trending_token_indices, # NEW
653
+ 'boosted_token_indices': boosted_token_indices, # NEW
654
+ 'holder_snapshot_indices': holder_snapshot_indices, # This now points to the generated embeddings
655
+ 'textual_event_indices': textual_event_indices, # RENAMED
656
+ 'ohlc_indices': ohlc_indices,
657
+ # Raw Data for Encoders
658
+ 'embedding_pool': batch_embedding_pool, # NEW
659
+ 'token_encoder_inputs': token_encoder_inputs,
660
+ 'wallet_encoder_inputs': wallet_encoder_inputs, # ADDED BACK
661
+ 'ohlc_price_tensors': ohlc_inputs_dict['price_tensor'],
662
+ 'ohlc_interval_ids': ohlc_inputs_dict['interval_ids'],
663
+ 'graph_updater_links': graph_updater_links,
664
+ 'wallet_addr_to_batch_idx': wallet_addr_to_batch_idx, # NEW: Pass the mapping
665
+
666
+ 'dest_wallet_indices': dest_wallet_indices, # ADDED THIS LINE
667
+ 'original_author_indices': original_author_indices, # NEW
668
+ # --- NEW: Numerical Features for Events ---
669
+ 'transfer_numerical_features': transfer_numerical_features,
670
+ 'trade_numerical_features': trade_numerical_features,
671
+ 'trade_dex_ids': trade_dex_ids,
672
+ 'deployer_trade_numerical_features': deployer_trade_numerical_features,
673
+ 'trade_direction_ids': trade_direction_ids, # NEW
674
+ 'trade_mev_protection_ids': trade_mev_protection_ids, # NEW
675
+ 'smart_wallet_trade_numerical_features': smart_wallet_trade_numerical_features,
676
+ 'trade_is_bundle_ids': trade_is_bundle_ids, # NEW
677
+ 'pool_created_numerical_features': pool_created_numerical_features,
678
+ 'pool_created_protocol_ids': pool_created_protocol_ids, # NEW
679
+ 'liquidity_change_numerical_features': liquidity_change_numerical_features,
680
+ 'liquidity_change_type_ids': liquidity_change_type_ids, # NEW
681
+ 'fee_collected_numerical_features': fee_collected_numerical_features, # NEW
682
+ 'token_burn_numerical_features': token_burn_numerical_features, # NEW
683
+ 'supply_lock_numerical_features': supply_lock_numerical_features, # NEW
684
+ 'onchain_snapshot_numerical_features': onchain_snapshot_numerical_features, # NEW
685
+ 'boosted_token_numerical_features': boosted_token_numerical_features,
686
+ 'trending_token_numerical_features': trending_token_numerical_features,
687
+ 'trending_token_source_ids': trending_token_source_ids, # NEW
688
+ 'trending_token_timeframe_ids': trending_token_timeframe_ids, # NEW
689
+ 'dexboost_paid_numerical_features': dexboost_paid_numerical_features, # NEW
690
+ 'dexprofile_updated_flags': dexprofile_updated_flags, # NEW,
691
+ 'global_trending_numerical_features': global_trending_numerical_features, # NEW
692
+ 'chainsnapshot_numerical_features': chainsnapshot_numerical_features, # NEW
693
+ 'lighthousesnapshot_numerical_features': lighthousesnapshot_numerical_features,
694
+ 'lighthousesnapshot_protocol_ids': lighthousesnapshot_protocol_ids, # NEW
695
+ 'lighthousesnapshot_timeframe_ids': lighthousesnapshot_timeframe_ids, # NEW
696
+ 'migrated_protocol_ids': migrated_protocol_ids, # NEW
697
+ 'alpha_group_ids': alpha_group_ids, # NEW
698
+ 'channel_ids': channel_ids, # NEW
699
+ 'exchange_ids': exchange_ids, # NEW
700
+ 'holder_snapshot_raw_data': holder_snapshot_raw_data_list, # NEW: Raw data for end-to-end processing
701
+ 'textual_event_data': textual_event_data_list, # RENAMED
702
+ # Labels
703
+ 'labels': torch.stack([item['labels'] for item in batch]) if batch and 'labels' in batch[0] else None,
704
+ 'labels_mask': torch.stack([item['labels_mask'] for item in batch]) if batch and 'labels_mask' in batch[0] else None
705
+ }
706
+
707
+ # Filter out None values (e.g., if no labels provided)
708
+ return {k: v for k, v in collated_batch.items() if v is not None}
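The collator above inverts trending/boosted `rank` values via `1.0 / event.get('rank', ...)`, where `dict.get` only guards against a *missing* key; a rank explicitly stored as `0` would divide by zero. A minimal sketch of a safer helper (the function name is illustrative, not part of the original code):

```python
def inverted_rank(event: dict, large: float = 1e9) -> float:
    """Map rank 1 -> 1.0 and larger ranks -> smaller values.

    Treats a missing, None, or zero rank as `large`, so the inverted
    value is effectively 0 instead of raising ZeroDivisionError.
    """
    rank = event.get('rank') or large  # `or` catches missing, None, and 0
    return 1.0 / rank
```

With this guard, rank 1 still produces the highest value (1.0) and an absent or zero rank collapses to roughly zero rather than crashing the batch.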
data/data_fetcher.py ADDED
@@ -0,0 +1,1009 @@
1
+ # data_fetcher.py
2
+
3
+ from typing import List, Dict, Any, Tuple, Set, Optional
4
+ from collections import defaultdict
5
+ import datetime, time
6
+
7
+ # We need the vocabulary for mapping IDs
8
+ import models.vocabulary as vocab
9
+
10
+ class DataFetcher:
11
+ """
12
+ A dedicated class to handle all database queries for ClickHouse and Neo4j.
13
+ This keeps data fetching logic separate from the dataset and model.
14
+ """
15
+
16
+ # --- Explicit column definitions for wallet profile & social fetches ---
17
+ PROFILE_BASE_COLUMNS = [
18
+ 'wallet_address',
19
+ 'updated_at',
20
+ 'first_seen_ts',
21
+ 'last_seen_ts',
22
+ 'tags',
23
+ 'deployed_tokens',
24
+ 'funded_from',
25
+ 'funded_timestamp',
26
+ 'funded_signature',
27
+ 'funded_amount'
28
+ ]
29
+
30
+ PROFILE_METRIC_COLUMNS = [
31
+ 'balance',
32
+ 'transfers_in_count',
33
+ 'transfers_out_count',
34
+ 'spl_transfers_in_count',
35
+ 'spl_transfers_out_count',
36
+ 'total_buys_count',
37
+ 'total_sells_count',
38
+ 'total_winrate',
39
+ 'stats_1d_realized_profit_sol',
40
+ 'stats_1d_realized_profit_usd',
41
+ 'stats_1d_realized_profit_pnl',
42
+ 'stats_1d_buy_count',
43
+ 'stats_1d_sell_count',
44
+ 'stats_1d_transfer_in_count',
45
+ 'stats_1d_transfer_out_count',
46
+ 'stats_1d_avg_holding_period',
47
+ 'stats_1d_total_bought_cost_sol',
48
+ 'stats_1d_total_bought_cost_usd',
49
+ 'stats_1d_total_sold_income_sol',
50
+ 'stats_1d_total_sold_income_usd',
51
+ 'stats_1d_total_fee',
52
+ 'stats_1d_winrate',
53
+ 'stats_1d_tokens_traded',
54
+ 'stats_7d_realized_profit_sol',
55
+ 'stats_7d_realized_profit_usd',
56
+ 'stats_7d_realized_profit_pnl',
57
+ 'stats_7d_buy_count',
58
+ 'stats_7d_sell_count',
59
+ 'stats_7d_transfer_in_count',
60
+ 'stats_7d_transfer_out_count',
61
+ 'stats_7d_avg_holding_period',
62
+ 'stats_7d_total_bought_cost_sol',
63
+ 'stats_7d_total_bought_cost_usd',
64
+ 'stats_7d_total_sold_income_sol',
65
+ 'stats_7d_total_sold_income_usd',
66
+ 'stats_7d_total_fee',
67
+ 'stats_7d_winrate',
68
+ 'stats_7d_tokens_traded',
69
+ 'stats_30d_realized_profit_sol',
70
+ 'stats_30d_realized_profit_usd',
71
+ 'stats_30d_realized_profit_pnl',
72
+ 'stats_30d_buy_count',
73
+ 'stats_30d_sell_count',
74
+ 'stats_30d_transfer_in_count',
75
+ 'stats_30d_transfer_out_count',
76
+ 'stats_30d_avg_holding_period',
77
+ 'stats_30d_total_bought_cost_sol',
78
+ 'stats_30d_total_bought_cost_usd',
79
+ 'stats_30d_total_sold_income_sol',
80
+ 'stats_30d_total_sold_income_usd',
81
+ 'stats_30d_total_fee',
82
+ 'stats_30d_winrate',
83
+ 'stats_30d_tokens_traded'
84
+ ]
85
+
86
+ PROFILE_COLUMNS_FOR_QUERY = PROFILE_BASE_COLUMNS + PROFILE_METRIC_COLUMNS
87
+
88
+ SOCIAL_COLUMNS_FOR_QUERY = [
89
+ 'wallet_address',
90
+ 'pumpfun_username',
91
+ 'twitter_username',
92
+ 'telegram_channel',
93
+ 'kolscan_name',
94
+ 'cabalspy_name',
95
+ 'axiom_kol_name'
96
+ ]
97
+ def __init__(self, clickhouse_client: Any, neo4j_driver: Any):
98
+ self.db_client = clickhouse_client
99
+ self.graph_client = neo4j_driver
100
+ print("DataFetcher instantiated.")
101
+
102
+ def get_all_mints(self, start_date: Optional[datetime.datetime] = None) -> List[Dict[str, Any]]:
103
+ """
104
+ Fetches a list of all mint events to serve as dataset samples.
105
+ Can be filtered to only include mints on or after a given start_date.
106
+ """
107
+ query = "SELECT mint_address, timestamp, creator_address, protocol, token_name, token_symbol, token_uri, total_supply, token_decimals FROM mints"
108
+ params = {}
109
+ where_clauses = []
110
+
111
+ if start_date:
112
+ where_clauses.append("timestamp >= %(start_date)s")
113
+ params['start_date'] = start_date
114
+
115
+ if where_clauses:
116
+ query += " WHERE " + " AND ".join(where_clauses)
117
+
118
+ print(f"INFO: Executing query to get all mints: `{query}` with params: {params}")
119
+ try:
120
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
121
+ if not rows:
122
+ return []
123
+ columns = [col[0] for col in columns_info]
124
+ result = [dict(zip(columns, row)) for row in rows]
125
+ if not result:
126
+ return []
127
+ return result
128
+ except Exception as e:
129
+ print(f"ERROR: Failed to fetch token addresses from ClickHouse: {e}")
130
+ print("INFO: Falling back to mock token addresses for development.")
131
+ return [{'mint_address': 'tknA_real', 'timestamp': datetime.datetime.now(datetime.timezone.utc), 'creator_address': 'addr_Creator_Real', 'protocol': 0}]
132
+
133
+
134
+ def fetch_mint_record(self, token_address: str) -> Dict[str, Any]:
135
+ """
136
+ Fetches the raw mint record for a token from the 'mints' table.
137
+ """
138
+ query = "SELECT timestamp, creator_address, mint_address, protocol FROM mints WHERE mint_address = %(token_address)s ORDER BY timestamp ASC LIMIT 1"
140
+ print(f"INFO: Executing query to fetch mint record: `{query}`")
141
+
142
+ # The client returns a list of tuples; each row is zipped with the column names below
143
+ # Using column names from your schema
144
+ columns = ['timestamp', 'creator_address', 'mint_address', 'protocol']
145
+ try:
146
+ result = self.db_client.execute(query, {'token_address': token_address})
146
+
147
+ if not result or not result[0]:
148
+ raise ValueError(f"No mint event found for token {token_address}")
149
+
150
+ # Convert the tuple result into a dictionary
151
+ record = dict(zip(columns, result[0]))
152
+ return record
153
+ except Exception as e:
154
+ print(f"ERROR: Failed to fetch mint record for {token_address}: {e}")
155
+ print("INFO: Falling back to mock mint record for development.")
156
+ # Fallback for development if DB connection fails
157
+ return {
158
+ 'timestamp': datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1),
159
+ 'creator_address': 'addr_Creator_Real',
160
+ 'mint_address': token_address,
161
+ 'protocol': vocab.PROTOCOL_TO_ID.get("Pump V1", 0)
162
+ }
163
+
164
+ def fetch_wallet_profiles(self, wallet_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
165
+ """
166
+ Convenience wrapper around fetch_wallet_profiles_and_socials for profile-only data.
167
+ """
168
+ profiles, _ = self.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
169
+ return profiles
170
+
171
+ def fetch_wallet_socials(self, wallet_addresses: List[str]) -> Dict[str, Dict[str, Any]]:
172
+ """
173
+ Fetches wallet social records for a list of wallet addresses.
174
+ Returns a dictionary mapping wallet_address to its social data.
175
+ """
176
+ if not wallet_addresses:
177
+ return {}
178
+
179
+ query = "SELECT * FROM wallet_socials WHERE wallet_address IN %(addresses)s"
180
+ params = {'addresses': wallet_addresses}
181
+ print(f"INFO: Executing query to fetch wallet socials for {len(wallet_addresses)} wallets.")
182
+
183
+ try:
184
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
185
+ if not rows:
186
+ return {}
187
+
188
+ columns = [col[0] for col in columns_info]
189
+ socials = {}
190
+ for row in rows:
191
+ social_dict = dict(zip(columns, row))
192
+ wallet_addr = social_dict.get('wallet_address')
193
+ if wallet_addr:
194
+ socials[wallet_addr] = social_dict
195
+ return socials
196
+
197
+ except Exception as e:
198
+ print(f"ERROR: Failed to fetch wallet socials: {e}")
199
+ print("INFO: Returning empty dictionary for wallet socials.")
200
+ return {}
201
+
202
+ def fetch_wallet_profiles_and_socials(self,
203
+ wallet_addresses: List[str],
204
+ T_cutoff: datetime.datetime) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
205
+ """
206
+ Fetches wallet profiles (time-aware) and socials for all requested wallets in a single query.
207
+ Returns two dictionaries: profiles, socials.
208
+ """
209
+ if not wallet_addresses:
210
+ return {}, {}
211
+
212
+ social_columns = self.SOCIAL_COLUMNS_FOR_QUERY
213
+
214
+ profile_base_cols = self.PROFILE_BASE_COLUMNS
215
+ profile_metric_cols = self.PROFILE_METRIC_COLUMNS
216
+
217
+ profile_base_str = ",\n ".join(profile_base_cols)
218
+ metric_projection_cols = ['wallet_address', 'updated_at'] + profile_metric_cols
219
+ profile_metric_str = ",\n ".join(metric_projection_cols)
220
+
221
+ profile_base_select_cols = [col for col in profile_base_cols if col != 'wallet_address']
222
+ profile_metric_select_cols = [
223
+ col for col in profile_metric_cols if col not in ('wallet_address',)
224
+ ]
225
+ social_select_cols = [col for col in social_columns if col != 'wallet_address']
226
+
227
+ select_expressions = []
228
+ for col in profile_base_select_cols:
229
+ select_expressions.append(f"lp.{col} AS profile__{col}")
230
+ for col in profile_metric_select_cols:
231
+ select_expressions.append(f"lm.{col} AS profile__{col}")
232
+ for col in social_select_cols:
233
+ select_expressions.append(f"ws.{col} AS social__{col}")
234
+ select_clause = ""
235
+ if select_expressions:
236
+ select_clause = ",\n " + ",\n ".join(select_expressions)
237
+
238
+ query = f"""
239
+ WITH ranked_profiles AS (
240
+ SELECT
241
+ {profile_base_str},
242
+ ROW_NUMBER() OVER (PARTITION BY wallet_address ORDER BY updated_at DESC) AS rn
243
+ FROM wallet_profiles
244
+ WHERE wallet_address IN %(addresses)s
245
+ ),
246
+ latest_profiles AS (
247
+ SELECT
248
+ {profile_base_str}
249
+ FROM ranked_profiles
250
+ WHERE rn = 1
251
+ ),
252
+ ranked_metrics AS (
253
+ SELECT
254
+ {profile_metric_str},
255
+ ROW_NUMBER() OVER (PARTITION BY wallet_address ORDER BY updated_at DESC) AS rn
256
+ FROM wallet_profile_metrics
257
+ WHERE
258
+ wallet_address IN %(addresses)s
259
+ AND updated_at <= %(T_cutoff)s
260
+ ),
261
+ latest_metrics AS (
262
+ SELECT
263
+ {profile_metric_str}
264
+ FROM ranked_metrics
265
+ WHERE rn = 1
266
+ ),
267
+ requested_wallets AS (
268
+ SELECT DISTINCT wallet_address
269
+ FROM (SELECT arrayJoin(%(addresses)s) AS wallet_address)
270
+ )
271
+ SELECT
272
+ rw.wallet_address AS wallet_address
273
+ {select_clause}
274
+ FROM requested_wallets AS rw
275
+ LEFT JOIN latest_profiles AS lp ON rw.wallet_address = lp.wallet_address
276
+ LEFT JOIN latest_metrics AS lm ON rw.wallet_address = lm.wallet_address
277
+ LEFT JOIN wallet_socials AS ws ON rw.wallet_address = ws.wallet_address;
278
+ """
279
+
280
+ params = {'addresses': wallet_addresses, 'T_cutoff': T_cutoff}
281
+ print(f"INFO: Executing combined query for profiles+socials on {len(wallet_addresses)} wallets.")
282
+
283
+ try:
284
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
285
+ if not rows:
286
+ return {}, {}
287
+
288
+ columns = [col[0] for col in columns_info]
289
+ profiles: Dict[str, Dict[str, Any]] = {}
290
+ socials: Dict[str, Dict[str, Any]] = {}
291
+
292
+ profile_keys = [f"profile__{col}" for col in (profile_base_select_cols + profile_metric_select_cols)]
293
+ social_keys = [f"social__{col}" for col in social_select_cols]
294
+
295
+ for row in rows:
296
+ row_dict = dict(zip(columns, row))
297
+ wallet_addr = row_dict.get('wallet_address')
298
+ if not wallet_addr:
299
+ continue
300
+
301
+ profile_data = {}
302
+ if profile_keys:
303
+ for pref_key in profile_keys:
304
+ if pref_key in row_dict:
305
+ value = row_dict[pref_key]
306
+ profile_data[pref_key.replace('profile__', '')] = value
307
+
308
+ if profile_data and any(value is not None for value in profile_data.values()):
309
+ profile_data['wallet_address'] = wallet_addr
310
+ profiles[wallet_addr] = profile_data
311
+
312
+ social_data = {}
313
+ if social_keys:
314
+ for pref_key in social_keys:
315
+ if pref_key in row_dict:
316
+ value = row_dict[pref_key]
317
+ social_data[pref_key.replace('social__', '')] = value
318
+
319
+ if social_data and any(value is not None for value in social_data.values()):
320
+ social_data['wallet_address'] = wallet_addr
321
+ socials[wallet_addr] = social_data
322
+
323
+ return profiles, socials
324
+
325
+ except Exception as e:
326
+ print(f"ERROR: Combined profile/social query failed: {e}")
327
+ print("INFO: Falling back to separate queries.")
328
+ profiles = self.fetch_wallet_profiles(wallet_addresses, T_cutoff)
329
+ socials = self.fetch_wallet_socials(wallet_addresses)
330
+ return profiles, socials
331
+
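The CTEs above rely on the `ROW_NUMBER() OVER (PARTITION BY wallet_address ORDER BY updated_at DESC)` pattern to pick, for each wallet, the newest row at or before `T_cutoff`. The same point-in-time selection can be sketched in plain Python (an illustrative helper, not part of the module):

```python
def latest_per_key(rows, key='wallet_address', ts='updated_at', cutoff=None):
    """Return {key: row} keeping the newest row per key whose timestamp
    is <= cutoff (mirrors the `rn = 1` filter in the SQL above)."""
    best = {}
    for row in rows:
        if cutoff is not None and row[ts] > cutoff:
            continue  # ignore rows from after the cutoff (no look-ahead)
        k = row[key]
        if k not in best or row[ts] > best[k][ts]:
            best[k] = row  # keep the most recent surviving row per key
    return best
```

Filtering before ranking is what makes the query time-aware: a wallet's state as of `T_cutoff` never leaks information from later updates.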
332
+ def fetch_wallet_holdings(self, wallet_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, List[Dict[str, Any]]]:
333
+ """
334
+ Fetches top 3 wallet holding records for a list of wallet addresses that were active at T_cutoff.
335
+ Returns a dictionary mapping wallet_address to a LIST of its holding data.
336
+ """
337
+ if not wallet_addresses:
338
+ return {}
339
+
340
+ # --- NEW: Time-aware query based on user's superior logic ---
341
+ # 1. For each holding, find the latest state at or before T_cutoff.
342
+ # 2. Filter for holdings where the balance was greater than 0.
343
+ # 3. Rank these active holdings by total traded volume (bought cost + sold income, SOL-denominated despite the `_usd` alias) and take the top 3 per wallet.
344
+ query = """
345
+ WITH point_in_time_holdings AS (
346
+ SELECT
347
+ *,
348
+ COALESCE(history_bought_cost_sol, 0) + COALESCE(history_sold_income_sol, 0) AS total_volume_usd,
349
+ ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
350
+ FROM wallet_holdings
351
+ WHERE
352
+ wallet_address IN %(addresses)s
353
+ AND updated_at <= %(T_cutoff)s
354
+ ),
355
+ ranked_active_holdings AS (
356
+ SELECT *,
357
+ ROW_NUMBER() OVER(PARTITION BY wallet_address ORDER BY total_volume_usd DESC) as rn_per_wallet
358
+ FROM point_in_time_holdings
359
+ WHERE rn_per_holding = 1 AND current_balance > 0
360
+ )
361
+ SELECT *
362
+ FROM ranked_active_holdings
363
+ WHERE rn_per_wallet <= 3;
364
+ """
365
+ params = {'addresses': wallet_addresses, 'T_cutoff': T_cutoff}
366
+ print(f"INFO: Executing query to fetch wallet holdings for {len(wallet_addresses)} wallets.")
367
+
368
+ try:
369
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
370
+ if not rows:
371
+ return {}
372
+
373
+ columns = [col[0] for col in columns_info]
374
+ holdings = defaultdict(list)
375
+ for row in rows:
376
+ holding_dict = dict(zip(columns, row))
377
+ wallet_addr = holding_dict.get('wallet_address')
378
+ if wallet_addr:
379
+ holdings[wallet_addr].append(holding_dict)
380
+ return dict(holdings)
381
+
382
+ except Exception as e:
383
+ print(f"ERROR: Failed to fetch wallet holdings: {e}")
384
+ print("INFO: Returning empty dictionary for wallet holdings.")
385
+ return {}
386
+
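`fetch_wallet_holdings` layers a second ranking on top of the point-in-time filter: surviving active holdings are ordered by volume and capped at three per wallet. That top-N-per-group step can be sketched in plain Python (an illustrative helper, not part of the module):

```python
def top_n_per_group(rows, group='wallet_address', score='total_volume_usd', n=3):
    """Keep the n highest-scoring rows per group, mirroring the
    `rn_per_wallet <= 3` filter in the SQL above."""
    by_group = {}
    for row in rows:
        by_group.setdefault(row[group], []).append(row)
    out = {}
    for key, group_rows in by_group.items():
        group_rows.sort(key=lambda r: r[score], reverse=True)  # highest volume first
        out[key] = group_rows[:n]
    return out
```

Capping per wallet keeps the encoder input bounded regardless of how many tokens a whale happens to hold.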
387
+ def fetch_graph_links(self,
388
+ initial_addresses: List[str],
389
+ T_cutoff: datetime.datetime,
390
+ max_degrees: int = 2) -> Tuple[Dict[str, str], Dict[str, Dict[str, Any]]]:
391
+ """
392
+ Fetches graph links from Neo4j, traversing up to a max degree of separation.
393
+
394
+ Args:
395
+ initial_addresses: A list of starting wallet or token addresses.
396
+ max_degrees: The maximum number of hops to traverse in the graph.
397
+
398
+ Returns:
399
+ A tuple containing:
400
+ - A dictionary mapping entity addresses to their type ('Wallet' or 'Token').
401
+ - A dictionary of aggregated links, structured for the GraphUpdater.
402
+ """
403
+ if not initial_addresses:
404
+ return {}, {}  # match the documented (entity-type dict, links dict) return shape
405
+
406
+ cutoff_ts = int(T_cutoff.timestamp())
407
+
408
+ print(f"INFO: Fetching graph links up to {max_degrees} degrees for {len(initial_addresses)} initial entities...")
409
+ try:
410
+ with self.graph_client.session() as session:
411
+ all_entities = {addr: 'Token' for addr in initial_addresses} # Assume initial are tokens
412
+ newly_found_entities = set(initial_addresses)
413
+ aggregated_links = defaultdict(lambda: {'links': [], 'edges': []})
414
+
415
+ for i in range(max_degrees):
416
+ if not newly_found_entities:
417
+ break
418
+
419
+ print(f" - Degree {i+1}: Traversing from {len(newly_found_entities)} new entities...")
420
+
421
+ # Cypher query to find direct neighbors of the current frontier
422
+ query = """
423
+ MATCH (a)-[r]-(b)
424
+ WHERE a.address IN $addresses
425
+ RETURN a.address AS source_address, type(r) AS link_type, properties(r) AS link_props, b.address AS dest_address, labels(b)[0] AS dest_type
426
+ """
427
+ params = {'addresses': list(newly_found_entities)}
428
+ result = session.run(query, params)
429
+
430
+ current_degree_new_entities = set()
431
+ for record in result:
432
+ link_type = record['link_type']
433
+ link_props = dict(record['link_props'])
434
+ link_ts_raw = link_props.get('timestamp')
435
+ try:
436
+ link_ts = int(link_ts_raw)
437
+ except (TypeError, ValueError):
438
+ continue
439
+ if link_ts > cutoff_ts:
440
+ continue
441
+ source_addr = record['source_address']
442
+ dest_addr = record['dest_address']
443
+ dest_type = record['dest_type']
444
+
445
+ # Add the link and edge data
446
+ aggregated_links[link_type]['links'].append(link_props)
447
+ aggregated_links[link_type]['edges'].append((source_addr, dest_addr))
448
+
449
+ # If we found a new entity, add it to the set for the next iteration
450
+ if dest_addr not in all_entities.keys():
451
+ current_degree_new_entities.add(dest_addr)
452
+ all_entities[dest_addr] = dest_type
453
+
454
+ newly_found_entities = current_degree_new_entities
455
+
456
+ return all_entities, dict(aggregated_links)
457
+ except Exception as e:
458
+ print(f"ERROR: Failed to fetch graph links from Neo4j: {e}")
459
+ return {addr: 'Token' for addr in initial_addresses}, {}
460
+
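The degree-limited traversal in `fetch_graph_links` is a breadth-first frontier expansion: each round queries the neighbors of the current frontier, and only addresses never seen before seed the next round. A self-contained sketch over an in-memory adjacency map (a hypothetical stand-in for the Neo4j session):

```python
def expand_frontier(adjacency, initial, max_degrees=2):
    """Collect all entities reachable within max_degrees hops.

    adjacency: {address: [neighbor_address, ...]} -- stand-in for the
    Cypher neighbor query in fetch_graph_links.
    """
    seen = set(initial)
    frontier = set(initial)
    for _ in range(max_degrees):
        if not frontier:
            break  # nothing new last round; traversal is complete
        next_frontier = set()
        for addr in frontier:
            for neighbor in adjacency.get(addr, []):
                if neighbor not in seen:  # only unseen nodes extend the search
                    seen.add(neighbor)
                    next_frontier.add(neighbor)
        frontier = next_frontier
    return seen
```

Tracking `seen` is what bounds the work: a wallet reachable by several paths is expanded at most once, and the loop terminates as soon as a round discovers nothing new.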
461
+ def fetch_token_data(self, token_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
462
+ """
463
+ Fetches the latest token data for each address at or before T_cutoff.
464
+ Returns a dictionary mapping token_address to its data.
465
+ """
466
+ if not token_addresses:
467
+ return {}
468
+
469
+ # --- NEW: Time-aware query for historical token data ---
470
+ query = """
471
+ WITH ranked_tokens AS (
472
+ SELECT
473
+ *,
474
+ ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
475
+ FROM tokens
476
+ WHERE
477
+ token_address IN %(addresses)s
478
+ AND updated_at <= %(T_cutoff)s
479
+ )
480
+ SELECT token_address, name, symbol, token_uri, protocol, total_supply, decimals
481
+ FROM ranked_tokens
482
+ WHERE rn = 1;
483
+ """
484
+ params = {'addresses': token_addresses, 'T_cutoff': T_cutoff}
485
+ print(f"INFO: Executing query to fetch token data for {len(token_addresses)} tokens.")
486
+
487
+ try:
488
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
489
+
490
+ if not rows:
491
+ return {}
492
+
493
+ # Get column names from the query result description
494
+ columns = [col[0] for col in columns_info]
495
+
496
+ tokens = {}
497
+ for row in rows:
498
+ token_dict = dict(zip(columns, row))
499
+ token_addr = token_dict.get('token_address')
500
+ if token_addr:
501
+ # The 'tokens' table in the schema has 'token_address' but the
502
+ # collator expects 'address'. We'll add it for compatibility.
503
+ token_dict['address'] = token_addr
504
+ tokens[token_addr] = token_dict
505
+ return tokens
506
+
507
+ except Exception as e:
508
+ print(f"ERROR: Failed to fetch token data: {e}")
509
+ print("INFO: Returning empty dictionary for token data.")
510
+ return {}
511
+
512
+     def fetch_deployed_token_details(self, token_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
+         """
+         Fetches historical details for deployed tokens at or before T_cutoff.
+         """
+         if not token_addresses:
+             return {}
+
+         # --- NEW: Time-aware query for historical deployed token details ---
+         query = """
+             WITH ranked_tokens AS (
+                 SELECT
+                     *,
+                     ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
+                 FROM tokens
+                 WHERE
+                     token_address IN %(addresses)s
+                     AND updated_at <= %(T_cutoff)s
+             ),
+             ranked_token_metrics AS (
+                 SELECT
+                     token_address,
+                     ath_price_usd,
+                     ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
+                 FROM token_metrics
+                 WHERE
+                     token_address IN %(addresses)s
+                     AND updated_at <= %(T_cutoff)s
+             ),
+             latest_tokens AS (
+                 SELECT *
+                 FROM ranked_tokens
+                 WHERE rn = 1
+             ),
+             latest_token_metrics AS (
+                 SELECT *
+                 FROM ranked_token_metrics
+                 WHERE rn = 1
+             )
+             SELECT
+                 lt.token_address,
+                 lt.created_at,
+                 lt.updated_at,
+                 ltm.ath_price_usd,
+                 lt.total_supply,
+                 lt.decimals,
+                 (lt.launchpad != lt.protocol) AS has_migrated
+             FROM latest_tokens AS lt
+             LEFT JOIN latest_token_metrics AS ltm
+                 ON lt.token_address = ltm.token_address;
+         """
+         params = {'addresses': token_addresses, 'T_cutoff': T_cutoff}
+         print(f"INFO: Executing query to fetch deployed token details for {len(token_addresses)} tokens.")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return {}
+
+             columns = [col[0] for col in columns_info]
+             token_details = {row[0]: dict(zip(columns, row)) for row in rows}
+             return token_details
+         except Exception as e:
+             print(f"ERROR: Failed to fetch deployed token details: {e}")
+             return {}
+
+     def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
+         """
+         Fetches trades for a token, using a 3-part H/B/H strategy if the total count exceeds a threshold.
+         Returns three lists: early_trades, middle_trades, recent_trades.
+         """
+         if not token_address:
+             return [], [], []
+
+         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
+
+         # 1. Get the total count of trades for the token before the cutoff
+         count_query = "SELECT count() FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s"
+         try:
+             total_trades = self.db_client.execute(count_query, params)[0][0]
+             print(f"INFO: Found {total_trades} total trades for token {token_address} before {T_cutoff}.")
+         except Exception as e:
+             print(f"ERROR: Could not count trades for token {token_address}: {e}")
+             return [], [], []
+
+         # 2. Decide which query to use based on the count
+         if total_trades < count_threshold:
+             print("INFO: Fetching all trades (count is below H/B/H threshold).")
+             query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
+             try:
+                 rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+                 if not rows:
+                     return [], [], []
+                 columns = [col[0] for col in columns_info]
+                 all_trades = [dict(zip(columns, row)) for row in rows]
+                 # When not using HBH, all trades are considered "early"
+                 return all_trades, [], []
+             except Exception as e:
+                 print(f"ERROR: Failed to fetch all trades for token {token_address}: {e}")
+                 return [], [], []
+
+         # 3. Use the H/B/H strategy if the count is high
+         print("INFO: Fetching trades using 3-part High-Def/Blurry/High-Def strategy.")
+         try:
+             # Fetch Early (High-Def)
+             early_query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC LIMIT %(limit)s"
+             early_rows, early_cols_info = self.db_client.execute(early_query, {'token_address': token_address, 'T_cutoff': T_cutoff, 'limit': early_limit}, with_column_types=True)
+             early_trades = [dict(zip([c[0] for c in early_cols_info], r)) for r in early_rows] if early_rows else []
+
+             # Fetch Recent (High-Def)
+             recent_query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp DESC LIMIT %(limit)s"
+             recent_rows, recent_cols_info = self.db_client.execute(recent_query, {'token_address': token_address, 'T_cutoff': T_cutoff, 'limit': recent_limit}, with_column_types=True)
+             recent_trades = [dict(zip([c[0] for c in recent_cols_info], r)) for r in recent_rows] if recent_rows else []
+             recent_trades.reverse()  # Order ASC
+
+             # Fetch Middle (Blurry - successful trades only)
+             middle_trades = []
+             if early_trades and recent_trades:
+                 start_middle_ts = early_trades[-1]['timestamp']
+                 end_middle_ts = recent_trades[0]['timestamp']
+                 if start_middle_ts < end_middle_ts:
+                     middle_query = """
+                         SELECT * FROM trades
+                         WHERE base_address = %(token_address)s
+                         AND success = true
+                         AND timestamp > %(start_ts)s
+                         AND timestamp < %(end_ts)s
+                         ORDER BY timestamp ASC
+                     """
+                     middle_params = {'token_address': token_address, 'start_ts': start_middle_ts, 'end_ts': end_middle_ts}
+                     middle_rows, middle_cols_info = self.db_client.execute(middle_query, middle_params, with_column_types=True)
+                     middle_trades = [dict(zip([c[0] for c in middle_cols_info], r)) for r in middle_rows] if middle_rows else []
+
+             return early_trades, middle_trades, recent_trades
+
+         except Exception as e:
+             print(f"ERROR: Failed to fetch H/B/H trades for token {token_address}: {e}")
+             return [], [], []
+
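The High-Def/Blurry/High-Def selection above can be sketched in plain Python on an in-memory trade list (assumed toy thresholds, not the production ones), which makes the boundary behavior easy to check:

```python
# Sketch of the H/B/H split: keep the first `early_limit` and last
# `recent_limit` trades in full detail, and only successful trades
# strictly between those two windows ("blurry" middle).
def hbh_split(trades, count_threshold, early_limit, recent_limit):
    """Return (early, middle, recent), each ordered by timestamp ascending."""
    trades = sorted(trades, key=lambda t: t['timestamp'])
    if len(trades) < count_threshold:
        return trades, [], []  # below threshold: everything counts as "early"
    early = trades[:early_limit]
    recent = trades[-recent_limit:]
    start_ts = early[-1]['timestamp']
    end_ts = recent[0]['timestamp']
    # Mirrors the SQL middle_query: success = true, exclusive bounds
    middle = [t for t in trades
              if t.get('success') and start_ts < t['timestamp'] < end_ts]
    return early, middle, recent

trades = [{'timestamp': i, 'success': i % 2 == 0} for i in range(10)]
early, middle, recent = hbh_split(trades, count_threshold=8, early_limit=3, recent_limit=3)
# early: ts 0..2, recent: ts 7..9, middle: successful trades at ts 4 and 6
```

Note that the exclusive bounds (`>` / `<`) prevent the middle window from duplicating the boundary trades already present in the early and recent slices.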
+     def fetch_future_trades_for_token(self,
+                                       token_address: str,
+                                       start_ts: datetime.datetime,
+                                       end_ts: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches successful trades for a token in the window (start_ts, end_ts].
+         Used for constructing label targets beyond the cutoff.
+         """
+         if not token_address or start_ts is None or end_ts is None or start_ts >= end_ts:
+             return []
+
+         query = """
+             SELECT *
+             FROM trades
+             WHERE base_address = %(token_address)s
+             AND success = true
+             AND timestamp > %(start_ts)s
+             AND timestamp <= %(end_ts)s
+             ORDER BY timestamp ASC
+         """
+         params = {
+             'token_address': token_address,
+             'start_ts': start_ts,
+             'end_ts': end_ts
+         }
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch future trades for token {token_address}: {e}")
+             return []
+
+     def fetch_transfers_for_token(self, token_address: str, T_cutoff: datetime.datetime, min_amount_threshold: float = 10_000_000) -> List[Dict[str, Any]]:
+         """
+         Fetches all transfers for a token before T_cutoff, filtering out small amounts.
+         """
+         if not token_address:
+             return []
+
+         query = """
+             SELECT * FROM transfers
+             WHERE mint_address = %(token_address)s
+             AND timestamp <= %(T_cutoff)s
+             AND amount_decimal >= %(min_amount)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token_address': token_address, 'T_cutoff': T_cutoff, 'min_amount': min_amount_threshold}
+         print(f"INFO: Fetching significant transfers for {token_address} (amount >= {min_amount_threshold}).")
+
+         try:
+             # This query no longer uses H/B/H; it fetches all significant transfers
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch transfers for token {token_address}: {e}")
+             return []
+
+     def fetch_pool_creations_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches pool creation records where the token is the base asset.
+         """
+         if not token_address:
+             return []
+
+         query = """
+             SELECT
+                 signature,
+                 timestamp,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 protocol,
+                 creator_address,
+                 pool_address,
+                 base_address,
+                 quote_address,
+                 lp_token_address,
+                 initial_base_liquidity,
+                 initial_quote_liquidity,
+                 base_decimals,
+                 quote_decimals
+             FROM pool_creations
+             WHERE base_address = %(token_address)s
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token_address': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching pool creation events for {token_address}.")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch pool creations for token {token_address}: {e}")
+             return []
+
+     def fetch_liquidity_changes_for_pools(self, pool_addresses: List[str], T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches liquidity change records for the given pools up to T_cutoff.
+         """
+         if not pool_addresses:
+             return []
+
+         query = """
+             SELECT
+                 signature,
+                 timestamp,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 protocol,
+                 change_type,
+                 lp_provider,
+                 pool_address,
+                 base_amount,
+                 quote_amount
+             FROM liquidity
+             WHERE pool_address IN %(pool_addresses)s
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'pool_addresses': pool_addresses, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching liquidity change events for {len(pool_addresses)} pool(s).")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch liquidity changes for pools {pool_addresses}: {e}")
+             return []
+
+     def fetch_fee_collections_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches fee collection events where the token appears as either token_0 or token_1.
+         """
+         if not token_address:
+             return []
+
+         query = """
+             SELECT
+                 timestamp,
+                 signature,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 protocol,
+                 recipient_address,
+                 token_0_mint_address,
+                 token_0_amount,
+                 token_1_mint_address,
+                 token_1_amount
+             FROM fee_collections
+             WHERE (token_0_mint_address = %(token)s OR token_1_mint_address = %(token)s)
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching fee collection events for {token_address}.")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch fee collections for token {token_address}: {e}")
+             return []
+
+     def fetch_migrations_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches migration records for a given token up to T_cutoff.
+         """
+         if not token_address:
+             return []
+         query = """
+             SELECT
+                 timestamp,
+                 signature,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 protocol,
+                 mint_address,
+                 virtual_pool_address,
+                 pool_address,
+                 migrated_base_liquidity,
+                 migrated_quote_liquidity
+             FROM migrations
+             WHERE mint_address = %(token)s
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching migrations for {token_address}.")
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch migrations for token {token_address}: {e}")
+             return []
+
+     def fetch_burns_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches burn events for a given token up to T_cutoff.
+         Schema: burns(timestamp, signature, slot, success, error, priority_fee, mint_address, source, amount, amount_decimal, source_balance)
+         """
+         if not token_address:
+             return []
+
+         query = """
+             SELECT
+                 timestamp,
+                 signature,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 mint_address,
+                 source,
+                 amount,
+                 amount_decimal,
+                 source_balance
+             FROM burns
+             WHERE mint_address = %(token)s
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching burn events for {token_address}.")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch burns for token {token_address}: {e}")
+             return []
+
+     def fetch_supply_locks_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
+         """
+         Fetches supply lock events for a given token up to T_cutoff.
+         Schema: supply_locks(timestamp, signature, slot, success, error, priority_fee, protocol, contract_address, sender, recipient, mint_address, total_locked_amount, final_unlock_timestamp)
+         """
+         if not token_address:
+             return []
+
+         query = """
+             SELECT
+                 timestamp,
+                 signature,
+                 slot,
+                 success,
+                 error,
+                 priority_fee,
+                 protocol,
+                 contract_address,
+                 sender,
+                 recipient,
+                 mint_address,
+                 total_locked_amount,
+                 final_unlock_timestamp
+             FROM supply_locks
+             WHERE mint_address = %(token)s
+             AND timestamp <= %(T_cutoff)s
+             ORDER BY timestamp ASC
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Fetching supply lock events for {token_address}.")
+
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch supply locks for token {token_address}: {e}")
+             return []
+
+     def fetch_token_holders_for_snapshot(self, token_address: str, T_cutoff: datetime.datetime, limit: int = 200) -> List[Dict[str, Any]]:
+         """
+         Fetch top holders for a token at or before T_cutoff for snapshot purposes.
+         Returns rows with wallet_address and current_balance (>0), ordered by balance desc.
+         """
+         if not token_address:
+             return []
+         query = """
+             WITH point_in_time_holdings AS (
+                 SELECT *,
+                        ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
+                 FROM wallet_holdings
+                 WHERE mint_address = %(token)s AND updated_at <= %(T_cutoff)s
+             )
+             SELECT wallet_address, current_balance
+             FROM point_in_time_holdings
+             WHERE rn_per_holding = 1 AND current_balance > 0
+             ORDER BY current_balance DESC
+             LIMIT %(limit)s;
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff, 'limit': int(limit)}
+         print(f"INFO: Fetching top holders for snapshot for {token_address} (limit {limit}).")
+         try:
+             rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
+             if not rows:
+                 return []
+             columns = [col[0] for col in columns_info]
+             return [dict(zip(columns, row)) for row in rows]
+         except Exception as e:
+             print(f"ERROR: Failed to fetch token holders for {token_address}: {e}")
+             return []
+
+     def fetch_total_holders_count_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> int:
+         """
+         Returns the total number of wallets holding the token (current_balance > 0)
+         at or before T_cutoff.
+         """
+         if not token_address:
+             return 0
+         query = """
+             WITH point_in_time_holdings AS (
+                 SELECT *,
+                        ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
+                 FROM wallet_holdings
+                 WHERE mint_address = %(token)s AND updated_at <= %(T_cutoff)s
+             )
+             SELECT count()
+             FROM point_in_time_holdings
+             WHERE rn_per_holding = 1 AND current_balance > 0;
+         """
+         params = {'token': token_address, 'T_cutoff': T_cutoff}
+         print(f"INFO: Counting total holders for {token_address} at cutoff.")
+         try:
+             rows = self.db_client.execute(query, params)
+             if not rows:
+                 return 0
+             return int(rows[0][0])
+         except Exception as e:
+             print(f"ERROR: Failed to count total holders for token {token_address}: {e}")
+             return 0
data/data_loader.py ADDED
@@ -0,0 +1,1657 @@
+ import torch
+ from collections import defaultdict
+ import datetime
+ import requests
+ from io import BytesIO
+ from torch.utils.data import Dataset, IterableDataset
+ from PIL import Image
+ from typing import List, Dict, Any, Optional, Union, Tuple
+ from pathlib import Path
+ import numpy as np
+ from bisect import bisect_left, bisect_right
+
+ # We need the vocabulary for IDs and the processor for the pooler
+ import models.vocabulary as vocab
+ from models.multi_modal_processor import MultiModalEncoder
+ from data.data_fetcher import DataFetcher  # NEW: Import the DataFetcher
+
+ # --- NEW: Hardcoded decimals for common quote tokens ---
+ QUOTE_TOKEN_DECIMALS = {
+     'So11111111111111111111111111111111111111112': 9,   # SOL
+     'EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v': 6,  # USDC
+     'Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB': 6,  # USDT
+ }
+
+ # --- NEW: Hyperparameters for trade event classification ---
+ LARGE_TRADE_USD_THRESHOLD = 100.0
+ LARGE_TRADE_SUPPLY_PCT_THRESHOLD = 0.03     # 3% of supply
+ LARGE_TRANSFER_SUPPLY_PCT_THRESHOLD = 0.03  # 3% of supply
+ SMART_WALLET_PNL_THRESHOLD = 3.0            # 300% PNL
+ SMART_WALLET_USD_THRESHOLD = 20000.0
+
+ # --- NEW: Hyperparameters for H/B/H Event Fetching ---
+ EVENT_COUNT_THRESHOLD_FOR_HBH = 30000  # If total events > this, use H/B/H
+ HBH_EARLY_EVENT_LIMIT = 10000
+ HBH_RECENT_EVENT_LIMIT = 15000
+
+ # --- NEW: OHLC Sequence Length Constant ---
+ OHLC_SEQ_LEN = 300  # 4 minutes of chart
+
+ MIN_AMOUNT_TRANSFER_SUPPLY = 0.0  # fraction of total supply; 0.0 disables the filter
+
+ # Interval for HolderSnapshot events (seconds)
+ HOLDER_SNAPSHOT_INTERVAL_SEC = 300
+ HOLDER_SNAPSHOT_TOP_K = 200
+
+
+ class EmbeddingPooler:
+     """
+     A helper class to manage the collection and encoding of unique text/image items
+     for a single data sample.
+     """
+     def __init__(self):
+         self.pool_map = {}
+         self.next_idx = 1  # 0 is padding
+
+     def get_idx(self, item: Any) -> int:
+         """
+         Returns a unique index for a given item (string or image).
+         - Returns 0 for None or empty strings.
+         - Deduplicates identical text and image objects.
+         """
+         if item is None:
+             return 0
+
+         # Handle text case
+         if isinstance(item, str):
+             if not item.strip():  # skip empty or whitespace-only strings
+                 return 0
+             key = item.strip()  # use normalized text key
+         elif isinstance(item, Image.Image):
+             key = id(item)  # unique memory address for images
+         else:
+             key = item  # fallback: use object itself if hashable
+
+         if key not in self.pool_map:
+             self.pool_map[key] = {'item': item, 'idx': self.next_idx}
+             self.next_idx += 1
+
+         return self.pool_map[key]['idx']
+
+     def get_all_items(self) -> List[Dict[str, Any]]:
+         """
+         Returns a list of all unique items, sorted by their assigned index.
+         """
+         if not self.pool_map:
+             return []
+         return sorted(self.pool_map.values(), key=lambda x: x['idx'])
+
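The pooler's indexing contract (index 0 reserved for padding, normalized duplicates sharing one slot) is easiest to see in a small usage sketch. `TinyPooler` below is a condensed stand-in for `EmbeddingPooler` restricted to strings, so the example stays self-contained:

```python
class TinyPooler:
    """Condensed stand-in for EmbeddingPooler (same string-indexing contract)."""
    def __init__(self):
        self.pool_map = {}
        self.next_idx = 1  # 0 is reserved for padding / missing items

    def get_idx(self, item):
        if item is None or (isinstance(item, str) and not item.strip()):
            return 0  # None, empty, and whitespace-only all map to padding
        key = item.strip() if isinstance(item, str) else item
        if key not in self.pool_map:
            self.pool_map[key] = self.next_idx  # first sighting gets a new slot
            self.next_idx += 1
        return self.pool_map[key]

p = TinyPooler()
a = p.get_idx("pepe")      # first unique string -> 1
b = p.get_idx("  pepe  ")  # whitespace-normalized duplicate -> same slot
c = p.get_idx("doge")      # second unique string -> 2
d = p.get_idx("")          # empty string -> padding index 0
```

Because indices start at 1, the downstream embedding table can keep row 0 as an all-zeros padding vector without any special-casing.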
+ class OracleDataset(Dataset):
+     """
+     Dataset class for the Oracle model. It fetches, processes, and structures
+     all on-chain and off-chain data for a given token to create a comprehensive
+     input sequence for the model.
+     """
+     def __init__(self,
+                  data_fetcher: DataFetcher,  # NEW: Pass the fetcher instance
+                  horizons_seconds: List[int] = [],
+                  quantiles: List[float] = [],
+                  max_samples: Optional[int] = None,
+                  ohlc_stats_path: Union[str, Path] = "./data/ohlc_stats.npz",  # NEW: Add stats path parameter
+                  token_allowlist: Optional[List[str]] = None,
+                  t_cutoff_seconds: int = 60,
+                  cache_dir: Optional[Union[str, Path]] = None,
+                  start_date: Optional[datetime.datetime] = None,
+                  min_trade_usd: float = 0.0):
+
+         # --- NEW: Create a persistent requests session for efficiency ---
+         self.http_session = requests.Session()
+
+         self.fetcher = data_fetcher
+         self.cache_dir = Path(cache_dir) if cache_dir else None
+
+         # If a fetcher is provided, we can determine the number of samples.
+         # Otherwise, we are likely in a test mode where __len__ might not be called
+         # or is used with a mock length.
+         self.t_cutoff_seconds = max(0, int(t_cutoff_seconds or 0))
+         self.token_allowlist = set(token_allowlist) if token_allowlist else None
+
+         if self.cache_dir and self.cache_dir.is_dir():
+             print(f"INFO: Initializing dataset in offline (cached) mode from: {self.cache_dir}")
+             # Scan for cached files to determine length
+             self.cached_files = sorted(self.cache_dir.glob("sample_*.pt"), key=lambda p: int(p.stem.split('_')[1]))
+             if not self.cached_files:
+                 raise RuntimeError(f"Cache directory '{self.cache_dir}' provided but contains no 'sample_*.pt' files.")
+
+             self.num_samples = len(self.cached_files)
+             if max_samples is not None:
+                 self.num_samples = min(max_samples, self.num_samples)
+                 self.cached_files = self.cached_files[:self.num_samples]
+             print(f"INFO: Found {self.num_samples} cached samples to use.")
+             self.sampled_mints = []  # Not needed in cached mode
+             self.available_mints = []
+
+         elif self.fetcher:
+             print("INFO: Initializing dataset in online (generation) mode...")
+             self.available_mints = self.fetcher.get_all_mints(start_date=start_date)
+             if not self.available_mints:
+                 raise RuntimeError("Dataset initialization failed: no mint records returned from data fetcher.")
+             if self.token_allowlist:
+                 filtered_mints = [
+                     mint for mint in self.available_mints
+                     if mint.get('mint_address') in self.token_allowlist
+                 ]
+                 if not filtered_mints:
+                     raise RuntimeError(f"No mint records matched the provided token allowlist: {token_allowlist}")
+                 self.available_mints = filtered_mints
+
+             total_mints = len(self.available_mints)
+             if max_samples is None:
+                 self.num_samples = total_mints
+                 self.sampled_mints = self.available_mints
+             else:
+                 self.num_samples = min(max_samples, total_mints)
+                 if self.num_samples < total_mints:
+                     print(f"INFO: Limiting dataset to first {self.num_samples} of {total_mints} available mints.")
+                 self.sampled_mints = self.available_mints[:self.num_samples]
+         else:
+             self.available_mints = []
+             self.sampled_mints = []
+             self.num_samples = 1 if max_samples is None else max_samples
+
+         self.horizons_seconds = sorted(set(horizons_seconds))
+         self.quantiles = quantiles
+         self.num_outputs = len(self.horizons_seconds) * len(self.quantiles)
+
+         # --- NEW: Load global OHLC normalization stats ---
+         stats_path = Path(ohlc_stats_path)
+         if not stats_path.exists():
+             raise FileNotFoundError(f"Required OHLC stats file not found: {stats_path}")
+         stats = np.load(stats_path)
+         self.ohlc_price_mean = float(stats.get('mean_price_usd', 0.0))
+         self.ohlc_price_std = float(stats.get('std_price_usd', 1.0)) or 1.0
+
+         self.min_trade_usd = min_trade_usd
+
+     def __len__(self) -> int:
+         return self.num_samples
+
+     def _normalize_price_series(self, values: List[float]) -> List[float]:
+         if not values:
+             return values
+         denom = self.ohlc_price_std if abs(self.ohlc_price_std) > 1e-9 else 1.0
+         return [(float(v) - self.ohlc_price_mean) / denom for v in values]
+
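`_normalize_price_series` is a plain global z-score with a guard against a degenerate standard deviation. A standalone sketch with assumed stats (the real mean/std come from `ohlc_stats.npz`):

```python
# Global z-score normalization with a divide-by-zero guard, mirroring
# _normalize_price_series (mean and std here are made-up values).
def normalize(values, mean, std):
    denom = std if abs(std) > 1e-9 else 1.0  # guard: never divide by ~0
    return [(float(v) - mean) / denom for v in values]

out = normalize([1.0, 2.0, 3.0], mean=2.0, std=1.0)  # -> [-1.0, 0.0, 1.0]
zero_std = normalize([5.0], mean=5.0, std=0.0)       # guard kicks in, divides by 1.0
```

Using one global mean/std (rather than per-series stats) keeps the price scale comparable across tokens, at the cost of leaving per-token level shifts in the input.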
+     def _compute_future_return_labels(self,
+                                       anchor_price: Optional[float],
+                                       anchor_timestamp: int,
+                                       price_series: List[Tuple[int, float]]) -> Tuple[torch.Tensor, torch.Tensor, List[Dict[str, Any]]]:
+         if self.num_outputs == 0:
+             return torch.zeros(0), torch.zeros(0), []
+
+         if anchor_price is None or abs(anchor_price) < 1e-9 or not price_series:
+             return torch.zeros(self.num_outputs), torch.zeros(self.num_outputs), []
+
+         ts_list = [int(entry[0]) for entry in price_series]
+         price_list = [float(entry[1]) for entry in price_series]
+         if not ts_list:
+             return torch.zeros(self.num_outputs), torch.zeros(self.num_outputs), []
+
+         last_ts = ts_list[-1]
+
+         label_values: List[float] = []
+         mask_values: List[float] = []
+         debug_entries: List[Dict[str, Any]] = []
+
+         for horizon in self.horizons_seconds:
+             target_ts = anchor_timestamp + horizon
+             if target_ts > last_ts:
+                 horizon_mask = 0.0
+                 horizon_return = 0.0
+                 future_price = None
+             else:
+                 idx = bisect_right(ts_list, target_ts) - 1
+                 if idx < 0:
+                     horizon_mask = 0.0
+                     horizon_return = 0.0
+                     future_price = None
+                 else:
+                     future_price = price_list[idx]
+                     horizon_return = (future_price - anchor_price) / anchor_price
+                     horizon_return = max(min(horizon_return, 10.0), -10.0)
+                     horizon_mask = 1.0
+
+             for _ in self.quantiles:
+                 label_values.append(horizon_return)
+                 mask_values.append(horizon_mask)
+                 debug_entries.append({
+                     'horizon': horizon,
+                     'target_ts': target_ts,
+                     'future_price': future_price,
+                     'return': horizon_return,
+                     'mask': horizon_mask
+                 })
+
+         return (torch.tensor(label_values, dtype=torch.float32),
+                 torch.tensor(mask_values, dtype=torch.float32),
+                 debug_entries)
+
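The label construction above reduces to: for each horizon, take the last observed price at or before `anchor_timestamp + horizon` via `bisect_right`, compute the clipped return, and mask out horizons that extend past the end of the observed series. A torch-free sketch of that lookup (toy timestamps and prices):

```python
from bisect import bisect_right

def horizon_labels(anchor_price, anchor_ts, series, horizons, clip=10.0):
    """Return (labels, mask) per horizon; series is [(timestamp, price), ...]
    sorted ascending. Mirrors the lookup in _compute_future_return_labels."""
    ts = [t for t, _ in series]
    px = [p for _, p in series]
    labels, mask = [], []
    for h in horizons:
        target = anchor_ts + h
        if not ts or target > ts[-1]:
            labels.append(0.0)
            mask.append(0.0)  # horizon not observable yet -> masked out
            continue
        i = bisect_right(ts, target) - 1  # last sample at or before target
        r = (px[i] - anchor_price) / anchor_price
        labels.append(max(min(r, clip), -clip))  # clip extreme returns
        mask.append(1.0)
    return labels, mask

series = [(10, 1.0), (20, 1.5), (30, 3.0)]
labels, mask = horizon_labels(1.0, 10, series, horizons=[10, 15, 100])
# horizons 10 and 15 both resolve to the price at ts=20; horizon 100 is masked
```

The mask, not a sentinel label value, is what tells the loss which horizon slots are valid, so the zero placeholders never contribute gradient.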
+     def _generate_onchain_snapshots(
+         self,
+         token_address: str,
+         t0_timestamp: int,
+         T_cutoff: datetime.datetime,
+         interval_sec: int,
+         trade_events: List[Dict[str, Any]],
+         transfer_events: List[Dict[str, Any]],
+         aggregation_trades: List[Dict[str, Any]],
+         wallet_data: Dict[str, Any],
+         total_supply_dec: float,
+         _register_event_fn
+     ) -> None:
+         # Prepare helper sets and maps (static sniper set based on earliest buyers)
+         all_buy_trades = sorted([e for e in trade_events if e.get('trade_direction') == 0 and e.get('success', False)], key=lambda x: x['timestamp'])
+         sniper_wallets = []
+         seen_buyers = set()
+         for e in all_buy_trades:
+             wa = e['wallet_address']
+             if wa not in seen_buyers:
+                 sniper_wallets.append(wa)
+                 seen_buyers.add(wa)
+             if len(sniper_wallets) >= 70:
+                 break
+         sniper_set = set(sniper_wallets)
+
+         KOL_NAME_KEYS = ['kolscan_name', 'cabalspy_name', 'axiom_kol_name']
+
+         # Build time arrays for price lookup
+         agg_ts = [int(t['timestamp']) for t in aggregation_trades] if aggregation_trades else []
+         agg_price = [float(t.get('price_usd', 0.0) or 0.0) for t in aggregation_trades] if aggregation_trades else []
+
+         start_ts = t0_timestamp
+         end_ts = int(self._timestamp_to_order_value(T_cutoff)) if hasattr(self, '_timestamp_to_order_value') else int(T_cutoff.timestamp())
+         if end_ts - start_ts < interval_sec:
+             oc_snapshot_times = [end_ts]
+         else:
+             steps = (end_ts - start_ts) // interval_sec
+             oc_snapshot_times = [start_ts + i * interval_sec for i in range(1, steps + 1)]
+
+         buyers_seen_global = set()
+         prev_holders_count = 0
+         for ts_value in oc_snapshot_times:
+             window_start = ts_value - interval_sec
+             trades_win = [e for e in trade_events if e.get('success', False) and window_start < e['timestamp'] <= ts_value]
+             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
+
+             # Per-snapshot holder distribution at ts_value
+             cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
+             holder_records_ts = self.fetcher.fetch_token_holders_for_snapshot(token_address, cutoff_dt_ts, limit=HOLDER_SNAPSHOT_TOP_K)
+             holder_entries_ts = []
+             for rec in holder_records_ts:
+                 addr = rec.get('wallet_address')
+                 try:
+                     bal = float(rec.get('current_balance', 0.0) or 0.0)
+                 except (TypeError, ValueError):
+                     bal = 0.0
+                 pct = (bal / total_supply_dec) if total_supply_dec and total_supply_dec > 0 else 0.0
+                 if addr and pct > 0.0:
+                     holder_entries_ts.append({'wallet': addr, 'holding_pct': pct})
+             holder_entries_ts.sort(key=lambda d: d['holding_pct'], reverse=True)
+
+             # Emit HolderSnapshot for this ts_value
+             hs_event = {
+                 'event_type': 'HolderSnapshot',
+                 'timestamp': int(ts_value),
+                 'relative_ts': ts_value - t0_timestamp,
+                 'holders': holder_entries_ts
+             }
+             _register_event_fn(hs_event, self._event_execution_sort_key(ts_value, signature='HolderSnapshot') if hasattr(self, '_event_execution_sort_key') else (ts_value, 0, 0, 0, 'HolderSnapshot'))
310
+
311
+ holder_pct_map_ts = {d['wallet']: d['holding_pct'] for d in holder_entries_ts}
312
+ top10_holder_pct = sum(d['holding_pct'] for d in holder_entries_ts[:10]) if holder_entries_ts else 0.0
313
+
314
+ # Cumulative sets up to ts_value
315
+ rat_set_ts = set(ev['destination_wallet_address'] for ev in transfer_events if ev['timestamp'] <= ts_value)
316
+ bundle_buyer_set_ts = set(e['wallet_address'] for e in trade_events if e.get('is_bundle') and e.get('trade_direction') == 0 and e.get('success', False) and e['timestamp'] <= ts_value)
317
+
318
+ buy_count = sum(1 for e in trades_win if e.get('trade_direction') == 0)
319
+ sell_count = sum(1 for e in trades_win if e.get('trade_direction') == 1)
320
+ volume = sum(float(e.get('total_usd', 0.0) or 0.0) for e in trades_win)
321
+ total_txns = len(trades_win) + len(xfers_win)
322
+ global_fees_paid = sum(float(e.get('priority_fee', 0.0) or 0.0) for e in trades_win) + \
323
+ sum(float(e.get('priority_fee', 0.0) or 0.0) for e in xfers_win)
324
+
325
+ smart_trader_addrs = set(e['wallet_address'] for e in trades_win if e.get('event_type') == 'SmartWallet_Trade')
326
+ smart_traders = len(smart_trader_addrs)
327
+
328
+ kol_addrs = set()
329
+ for e in trades_win:
330
+ wa = e['wallet_address']
331
+ soc = wallet_data.get(wa, {}).get('socials', {})
332
+ if any(soc.get(k) for k in KOL_NAME_KEYS if soc):
333
+ kol_addrs.add(wa)
334
+ kols = len(kol_addrs)
335
+
336
+ new_buyers = [e['wallet_address'] for e in trades_win if e.get('trade_direction') == 0 and e['wallet_address'] not in buyers_seen_global]
337
+ for wa in new_buyers:
338
+ buyers_seen_global.add(wa)
339
+
340
+ # Compute growth against previous snapshot endpoint.
341
+ end_dt = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
342
+ holders_end = self.fetcher.fetch_total_holders_count_for_token(token_address, end_dt)
343
+ total_holders = float(holders_end)
344
+ delta_holders = holders_end - prev_holders_count
345
+ holder_growth_rate = float(delta_holders)
346
+ prev_holders_count = holders_end
347
+
348
+ # Market cap from last price at or before ts
349
+ last_price_usd = 0.0
350
+ if agg_ts:
351
+ for i in range(len(agg_ts) - 1, -1, -1):
352
+ if agg_ts[i] <= ts_value:
353
+ last_price_usd = agg_price[i]
354
+ break
355
+ current_market_cap = float(last_price_usd) * float(total_supply_dec)
356
+
357
+ oc_event = {
358
+ 'event_type': 'OnChain_Snapshot',
359
+ 'timestamp': int(ts_value),
360
+ 'relative_ts': ts_value - t0_timestamp,
361
+ 'total_holders': total_holders,
362
+ 'smart_traders': float(smart_traders),
363
+ 'kols': float(kols),
364
+ 'holder_growth_rate': float(holder_growth_rate),
365
+ 'top_10_holder_pct': float(top10_holder_pct),
366
+ 'sniper_holding_pct': float(sum(holder_pct_map_ts.get(wa, 0.0) for wa in sniper_set)),
367
+ 'rat_wallets_holding_pct': float(sum(holder_pct_map_ts.get(wa, 0.0) for wa in rat_set_ts)),
368
+ 'bundle_holding_pct': float(sum(holder_pct_map_ts.get(wa, 0.0) for wa in bundle_buyer_set_ts)),
369
+ 'current_market_cap': float(current_market_cap),
370
+ 'volume': float(volume),
371
+ 'buy_count': float(buy_count),
372
+ 'sell_count': float(sell_count),
373
+ 'total_txns': float(total_txns),
374
+ 'global_fees_paid': float(global_fees_paid)
375
+ }
376
+ _register_event_fn(oc_event, self._event_execution_sort_key(ts_value, signature='OnChain_Snapshot') if hasattr(self, '_event_execution_sort_key') else (ts_value, 0, 0, 0, 'OnChain_Snapshot'))
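The snapshot loop walks a fixed time grid derived from `t0_timestamp`, `T_cutoff`, and `interval_sec`, collapsing to a single snapshot at the cutoff when the window is shorter than one interval. A small standalone sketch of that grid computation (the helper name is ours):

```python
def snapshot_times(start_ts, end_ts, interval_sec):
    """Snapshot timestamps on a fixed grid after start_ts, clamped to end_ts."""
    if end_ts - start_ts < interval_sec:
        return [end_ts]  # window shorter than one interval: one snapshot at the cutoff
    steps = (end_ts - start_ts) // interval_sec
    return [start_ts + i * interval_sec for i in range(1, steps + 1)]

print(snapshot_times(0, 100, 30))  # [30, 60, 90]
print(snapshot_times(0, 20, 30))   # [20]
```

Note that integer floor division drops any partial trailing interval, so the grid never overshoots `end_ts`.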
+
+    def _calculate_deployed_token_stats(self, profiles: Dict[str, Dict[str, Any]], T_cutoff: datetime.datetime):
+        """
+        Calculates aggregate statistics for wallets based on the tokens they've deployed.
+        This method modifies the `profiles` dictionary in-place.
+        """
+        if not profiles:
+            return
+
+        for addr, profile in profiles.items():
+            deployed_tokens = profile.get('deployed_tokens', [])
+
+            # 1. Deployed tokens count
+            count = len(deployed_tokens)
+            profile['deployed_tokens_count'] = float(count)
+
+            if count == 0:
+                profile['deployed_tokens_migrated_pct'] = 0.0
+                profile['deployed_tokens_avg_lifetime_sec'] = 0.0
+                profile['deployed_tokens_avg_peak_mc_usd'] = 0.0
+                profile['deployed_tokens_median_peak_mc_usd'] = 0.0
+                continue
+
+            # --- NEW: Fetch deployed token details with point-in-time logic ---
+            deployed_token_details = self.fetcher.fetch_deployed_token_details(deployed_tokens, T_cutoff)
+
+            # Collect stats for all deployed tokens of this wallet
+            lifetimes = []
+            peak_mcs = []
+            migrated_count = 0
+            for token_addr in deployed_tokens:
+                details = deployed_token_details.get(token_addr)
+                if not details:
+                    continue
+
+                if details.get('has_migrated'):
+                    migrated_count += 1
+
+                lifetimes.append((details['updated_at'] - details['created_at']).total_seconds())
+                # Simplified market cap: ATH price * decimal-adjusted supply
+                peak_mcs.append(details.get('ath_price_usd', 0.0) * details.get('total_supply', 0.0) / (10**details.get('decimals', 9)))
+
+            # 2. Migrated pct (count is guaranteed > 0 here)
+            profile['deployed_tokens_migrated_pct'] = migrated_count / count
+            # 3. Avg lifetime
+            profile['deployed_tokens_avg_lifetime_sec'] = torch.mean(torch.tensor(lifetimes)).item() if lifetimes else 0.0
+            # 4. Avg & median peak MC
+            profile['deployed_tokens_avg_peak_mc_usd'] = torch.mean(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
+            profile['deployed_tokens_median_peak_mc_usd'] = torch.median(torch.tensor(peak_mcs)).item() if peak_mcs else 0.0
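Keeping both the average and the median peak market cap is useful because a single outlier "runner" dominates the mean while leaving the median intact. A quick illustration with hypothetical values; note that `torch.median` returns the lower of the two middle values for even-length input, while `statistics.median` averages them, so the two can differ on even-sized samples:

```python
import statistics

# Hypothetical peak market caps for three deployed tokens; one runner skews the mean.
peak_mcs = [1_000.0, 5_000.0, 250_000.0]
avg_peak = statistics.mean(peak_mcs)
median_peak = statistics.median(peak_mcs)
print(avg_peak)     # 85333.33...
print(median_peak)  # 5000.0
```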
+
+    def _process_wallet_data(self, wallet_addresses: List[str], token_data: Dict[str, Any], pooler: EmbeddingPooler, T_cutoff: datetime.datetime) -> tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
+        """
+        Fetches and processes profile, social, and holdings data for a list of wallets.
+        Uses a T_cutoff to ensure data is point-in-time accurate.
+        """
+        if not wallet_addresses:
+            return {}, token_data
+
+        print(f"INFO: Processing wallet data for {len(wallet_addresses)} unique wallets...")
+        # Bulk fetch all data
+        profiles, socials = self.fetcher.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
+        holdings = self.fetcher.fetch_wallet_holdings(wallet_addresses, T_cutoff)
+
+        valid_wallets = [addr for addr in wallet_addresses if addr in profiles]
+        dropped_wallets = set(wallet_addresses) - set(valid_wallets)
+        if dropped_wallets:
+            print(f"INFO: Skipping {len(dropped_wallets)} wallets with no profile before cutoff.")
+        if not valid_wallets:
+            print("INFO: All wallets were graph-only or appeared after cutoff; skipping wallet processing for this token.")
+            return {}, token_data
+        wallet_addresses = valid_wallets
+
+        # --- NEW: Collect all unique mints from holdings to fetch their data ---
+        all_holding_mints = set()
+        for wallet_addr in wallet_addresses:
+            for holding_item in holdings.get(wallet_addr, []):
+                if 'mint_address' in holding_item:
+                    all_holding_mints.add(holding_item['mint_address'])
+
+        # --- NEW: Process all discovered tokens with point-in-time logic ---
+        # 1. Fetch raw data for all newly found tokens from holdings.
+        # 2. Process this raw data to get embedding indices and add to the pooler.
+        # Note: _process_token_data is designed to take a list and return a dict.
+        # We pass the addresses and let it handle the fetching and processing internally.
+        processed_new_tokens = self._process_token_data(list(all_holding_mints), pooler, T_cutoff)
+        # 3. Merge the fully processed new tokens with the existing main token data.
+        all_token_data = {**token_data, **(processed_new_tokens or {})}
+
+        # --- NEW: Calculate deployed token stats using point-in-time logic ---
+        self._calculate_deployed_token_stats(profiles, T_cutoff)
+
+        # --- Assemble the final wallet dictionary ---
+        # This structure is exactly what the WalletEncoder expects.
+        final_wallets = {}
+        for addr in wallet_addresses:
+
+            # --- Define all expected numerical keys for a profile ---
+            # This prevents KeyErrors if the DB returns a partial profile.
+            expected_profile_keys = [
+                'age', 'deployed_tokens_count', 'deployed_tokens_migrated_pct',
+                'deployed_tokens_avg_lifetime_sec', 'deployed_tokens_avg_peak_mc_usd',
+                'deployed_tokens_median_peak_mc_usd', 'balance', 'transfers_in_count',
+                'transfers_out_count', 'spl_transfers_in_count', 'spl_transfers_out_count',
+                'total_buys_count', 'total_sells_count', 'total_winrate',
+                'stats_1d_realized_profit_sol', 'stats_1d_realized_profit_pnl', 'stats_1d_buy_count',
+                'stats_1d_sell_count', 'stats_1d_transfer_in_count', 'stats_1d_transfer_out_count',
+                'stats_1d_avg_holding_period', 'stats_1d_total_bought_cost_sol', 'stats_1d_total_sold_income_sol',
+                'stats_1d_total_fee', 'stats_1d_winrate', 'stats_1d_tokens_traded',
+                'stats_7d_realized_profit_sol', 'stats_7d_realized_profit_pnl', 'stats_7d_buy_count',
+                'stats_7d_sell_count', 'stats_7d_transfer_in_count', 'stats_7d_transfer_out_count',
+                'stats_7d_avg_holding_period', 'stats_7d_total_bought_cost_sol', 'stats_7d_total_sold_income_sol',
+                'stats_7d_total_fee', 'stats_7d_winrate', 'stats_7d_tokens_traded'
+            ]
+            # --- NEW: If a wallet profile doesn't exist in the DB, skip it entirely. ---
+            # This removes the old logic that created a placeholder profile with zeroed-out features.
+            # "If it doesn't exist, it doesn't exist."
+            profile_data = profiles.get(addr)
+            if not profile_data:
+                print(f"INFO: Wallet {addr} found in graph but has no profile in DB. Skipping this wallet.")
+                continue
+
+            # --- NEW: Ensure all expected keys exist in the fetched profile ---
+            for key in expected_profile_keys:
+                profile_data.setdefault(key, 0.0)  # Use 0.0 as a safe default for any missing numerical key
+
+            social_data = socials.get(addr, {})
+
+            # --- NEW: Derive boolean social flags based on schema ---
+            social_data['has_pf_profile'] = bool(social_data.get('pumpfun_username'))
+            social_data['has_twitter'] = bool(social_data.get('twitter_username'))
+            social_data['has_telegram'] = bool(social_data.get('telegram_channel'))
+            # 'is_exchange_wallet' is not in the schema, so for now we derive it from the
+            # profile's 'tags' column; a dedicated tagging service would be a better source.
+            social_data['is_exchange_wallet'] = 'exchange_wallet' in profile_data.get('tags', [])
+
+            # --- NEW: Calculate 'age' at the cutoff so the feature stays point-in-time accurate ---
+            funded_ts = profile_data.get('funded_timestamp', 0)
+            if funded_ts and funded_ts > 0:
+                # Age in seconds from the funding timestamp up to T_cutoff (not wall-clock
+                # time, which would leak information from after the cutoff).
+                age_seconds = int(T_cutoff.timestamp()) - funded_ts
+            else:
+                # Fallback for wallets older than our DB window:
+                # 5 months * 30 days/month * 24 hours/day * 3600 seconds/hour
+                age_seconds = 12_960_000
+
+            # Add the calculated age to the profile data that the WalletEncoder will receive
+            profile_data['age'] = float(age_seconds)
+
+            # Get the username and add it to the embedding pooler
+            username = social_data.get('pumpfun_username') or social_data.get('twitter_username') or social_data.get('kolscan_name')
+
+            if isinstance(username, str) and username.strip():
+                social_data['username_emb_idx'] = pooler.get_idx(username.strip())
+            else:
+                social_data['username_emb_idx'] = 0  # means "no embedding"
+
+            # --- NEW: Filter holdings and calculate derived features ---
+            # We create a new list `valid_wallet_holdings` to ensure that if a holding's
+            # token is invalid (filtered out by _process_token_data), the entire holding
+            # row is removed and not passed to the WalletEncoder.
+            original_holdings = holdings.get(addr, [])
+            valid_wallet_holdings = []
+            for holding_item in original_holdings:
+                # 1. Calculate holding_time
+                start_ts = holding_item.get('start_holding_at')
+                mint_addr = holding_item.get('mint_address')
+                token_info = all_token_data.get(mint_addr)
+
+                if not token_info:
+                    print(f"INFO: Skipping holding for token {mint_addr} in wallet {addr} because token data is invalid/missing.")
+                    continue
+
+                end_ts = holding_item.get('end_holding_at')
+                if not start_ts:
+                    holding_item['holding_time'] = 0.0
+                else:
+                    # Open holdings are measured up to T_cutoff rather than wall-clock
+                    # time, to keep the feature point-in-time accurate.
+                    end_ts = end_ts or T_cutoff
+                    holding_item['holding_time'] = (end_ts - start_ts).total_seconds()
+
+                # 2. Calculate balance_pct_to_supply
+                if token_info.get('total_supply', 0) > 0:
+                    total_supply = token_info['total_supply'] / (10**token_info.get('decimals', 9))
+                    current_balance = holding_item.get('current_balance', 0.0)
+                    holding_item['balance_pct_to_supply'] = (current_balance / total_supply) if total_supply > 0 else 0.0
+                else:
+                    holding_item['balance_pct_to_supply'] = 0.0
+
+                # 3. --- NEW: Calculate bought_amount_sol_pct_to_native_balance ---
+                # This uses the historically accurate native balance from the profile.
+                wallet_native_balance = profile_data.get('balance', 0.0)
+                bought_cost_sol = holding_item.get('history_bought_cost_sol', 0.0)
+                if wallet_native_balance > 1e-9:  # Use a small epsilon to avoid division by zero
+                    holding_item['bought_amount_sol_pct_to_native_balance'] = bought_cost_sol / wallet_native_balance
+                else:
+                    holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
+
+                valid_wallet_holdings.append(holding_item)
+
+            final_wallets[addr] = {
+                'profile': profile_data,
+                'socials': social_data,
+                'holdings': valid_wallet_holdings
+            }
+
+        return final_wallets, all_token_data
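The `balance_pct_to_supply` feature above divides a holding's balance by the decimal-adjusted total supply. A standalone sketch of that computation (the function name is ours):

```python
def balance_pct_to_supply(current_balance, raw_total_supply, decimals=9):
    """Fraction of the decimal-adjusted supply held in one wallet."""
    total_supply = raw_total_supply / (10 ** decimals) if decimals > 0 else raw_total_supply
    return current_balance / total_supply if total_supply > 0 else 0.0

# 5M tokens out of a 1B-token supply stored on-chain with 9 decimals
print(balance_pct_to_supply(5_000_000.0, 1_000_000_000 * 10**9))  # 0.005
```

The guard on `total_supply > 0` mirrors the code's behavior of defaulting the feature to `0.0` when supply information is missing.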
+
+    def _process_token_data(self, token_addresses: List[str], pooler: EmbeddingPooler, T_cutoff: datetime.datetime, token_data: Optional[Dict] = None) -> Optional[Dict[str, Dict[str, Any]]]:
+        """
+        Fetches and processes static data for a list of tokens.
+        Returns None when a single main token fails validation.
+        """
+        if not token_addresses:
+            return {}
+
+        if token_data is None:
+            print(f"INFO: Processing token data for {len(token_addresses)} unique tokens...")
+            token_data = self.fetcher.fetch_token_data(token_addresses, T_cutoff)
+
+        # --- NEW: Print the raw fetched token data as requested ---
+        print("\n--- RAW TOKEN DATA FROM DATABASE ---")
+        print(token_data)
+
+        # Add pre-computed embedding indices to the token data.
+        # --- CRITICAL FIX: This function now returns None if the main token is invalid ---
+        valid_token_data = {}
+        for addr, data in token_data.items():
+            # --- FIXED: Only add to pooler if data is valid ---
+            image = None
+            token_uri = data.get('token_uri')
+
+            # --- NEW: Use multiple IPFS gateways for reliability ---
+            if token_uri and isinstance(token_uri, str) and token_uri.strip():
+
+                ipfs_gateways = [
+                    "https://pump.mypinata.cloud/ipfs/",
+                    "https://dweb.link/ipfs/",
+                    "https://cloudflare-ipfs.com/ipfs/",
+                ]
+
+                try:
+                    # 1. Fetch metadata JSON from token_uri
+                    if 'ipfs/' in token_uri:
+                        metadata_hash = token_uri.split('ipfs/')[-1]
+                        # Try fetching from multiple gateways
+                        for gateway in ipfs_gateways:
+                            try:
+                                metadata_resp = self.http_session.get(f"{gateway}{metadata_hash}", timeout=5)
+                                metadata_resp.raise_for_status()
+                                metadata = metadata_resp.json()
+                                break  # Success, exit loop
+                            except requests.RequestException:
+                                continue  # Try next gateway
+                        else:  # If all gateways fail
+                            raise requests.RequestException("All IPFS gateways failed for metadata.")
+                    else:  # Handle regular HTTP URIs
+                        metadata_resp = self.http_session.get(token_uri, timeout=5)
+                        metadata_resp.raise_for_status()
+                        metadata = metadata_resp.json()
+
+                    # 2. Fetch the token image referenced by the metadata
+                    image_url = metadata.get('image', '')
+
+                    # --- FIXED: Apply the same multi-gateway logic to image fetching ---
+                    if image_url:
+                        # Handle IPFS URIs for the image
+                        if 'ipfs/' in image_url:
+                            image_hash = image_url.split('ipfs/')[-1]
+                            # Try fetching image from multiple gateways
+                            for gateway in ipfs_gateways:
+                                try:
+                                    image_resp = self.http_session.get(f"{gateway}{image_hash}", timeout=10)
+                                    image_resp.raise_for_status()
+                                    image = Image.open(BytesIO(image_resp.content))
+                                    break  # Success, exit loop
+                                except requests.RequestException:
+                                    continue  # Try next gateway
+                            else:  # If all gateways fail for the image
+                                raise requests.RequestException("All IPFS gateways failed for image.")
+                        else:  # Handle regular HTTP image URLs
+                            image_resp = self.http_session.get(image_url, timeout=10)
+                            image_resp.raise_for_status()
+                            image = Image.open(BytesIO(image_resp.content))
+                except (requests.RequestException, ValueError, IOError) as e:
+                    print(f"WARN: Could not fetch or process image for token {addr} from URI {token_uri}. Reason: {e}")
+                    image = None  # Ensure image is None on failure
+
+            # --- FIXED: Check for valid metadata before adding to pooler ---
+            token_name = data.get('name') if data.get('name') and data.get('name').strip() else None
+            token_symbol = data.get('symbol') if data.get('symbol') and data.get('symbol').strip() else None
+
+            # --- An image is strictly required for a valid token ---
+            # --- FIXED: Correctly handle invalid secondary tokens without aborting the whole process ---
+            if not token_name or not token_symbol or not image:
+                if not token_name:
+                    reason = "name"
+                elif not token_symbol:
+                    reason = "symbol"
+                else:
+                    reason = "image (fetch failed)"
+
+                print(f"WARN: Token {addr} is missing essential metadata ('{reason}'). This token will be skipped.")
+
+                # If this function was called with only one token, it's the main token.
+                # If the main token is invalid, the whole sample is invalid, so return None.
+                if len(token_addresses) == 1:
+                    return None
+                # Otherwise, it's a secondary token. Skip it and continue with the others.
+                continue
+
+            # --- NEW: Add is_vanity feature based on the token address ---
+            data['is_vanity'] = addr.lower().endswith("pump")
+
+            data['image_emb_idx'] = pooler.get_idx(image)
+            data['name_emb_idx'] = pooler.get_idx(token_name)
+            data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
+
+            # --- FIX: Validate the protocol ID ---
+            # The DB might return an ID that is out of bounds for our nn.Embedding layer.
+            # We must ensure the ID is valid or map it to a default 'Unknown' ID.
+            raw_protocol_id = data.get('protocol')
+            if raw_protocol_id is not None and 0 <= raw_protocol_id < vocab.NUM_PROTOCOLS:
+                data['protocol'] = raw_protocol_id
+            else:
+                data['protocol'] = vocab.PROTOCOL_TO_ID.get('Unknown', 0)
+
+            valid_token_data[addr] = data
+
+        return valid_token_data
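The gateway loops above rely on Python's `for`/`else`: the `else` branch runs only when the loop completes without hitting `break`, which here means every gateway failed. A self-contained sketch of the pattern with stand-in fetchers (names invented here; plain `IOError` stands in for `requests.RequestException`):

```python
def fetch_with_fallback(fetchers):
    """Try each fetcher in order; break on success, raise if all of them fail."""
    result = None
    for fetch in fetchers:
        try:
            result = fetch()
            break  # success, exit loop
        except IOError:
            continue  # try the next gateway
    else:  # loop finished without a break: every fetcher raised
        raise IOError("All gateways failed.")
    return result

def down():
    raise IOError("gateway down")

print(fetch_with_fallback([down, lambda: "metadata"]))  # metadata
```

The design choice here is that per-gateway failures are swallowed, but total failure is re-raised so the outer `try`/`except` can log one warning and fall back to `image = None`.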
+
+    def _generate_ohlc(self, aggregation_trades: List[Dict[str, Any]], T_cutoff: datetime.datetime, interval_seconds: int) -> List[tuple]:
+        """
+        Generates an OHLC series from a list of aggregated trades with a dynamic interval.
+        It forward-fills gaps and extends the series up to T_cutoff.
+        Returns a list of (timestamp, open, close) tuples.
+        """
+        if not aggregation_trades:
+            return []
+
+        trades_by_interval = defaultdict(list)
+        for trade in aggregation_trades:
+            # Group trades into interval buckets
+            interval_start_ts = (trade['timestamp'] // interval_seconds) * interval_seconds
+            trades_by_interval[interval_start_ts].append(trade['price_usd'])
+
+        sorted_intervals = sorted(trades_by_interval.keys())
+
+        if not sorted_intervals:
+            return []
+
+        full_ohlc = []
+        start_ts = sorted_intervals[0]
+        end_ts = int(T_cutoff.timestamp())
+        # Align end_ts to the interval grid
+        end_ts = (end_ts // interval_seconds) * interval_seconds
+        last_price = aggregation_trades[0]['price_usd']
+
+        # --- NEW: Debugging log for trades grouped by interval ---
+        print("\n[DEBUG] OHLC Generation: Trades grouped by interval bucket:")
+        print(dict(trades_by_interval))
+
+        for ts in range(start_ts, end_ts + 1, interval_seconds):
+            if ts in trades_by_interval:
+                prices = trades_by_interval[ts]
+                open_price = prices[0]
+                close_price = prices[-1]
+                full_ohlc.append((ts, open_price, close_price))
+                last_price = close_price
+            else:
+                full_ohlc.append((ts, last_price, last_price))
+        return full_ohlc
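The bucketing and forward-fill behavior of `_generate_ohlc` can be sketched in isolation (the helper name and sample trades are ours): trades are floored onto an interval grid, and empty buckets repeat the last observed close.

```python
from collections import defaultdict

def ohlc_forward_fill(trades, end_ts, interval):
    """Bucket (ts, price) trades into intervals; forward-fill gaps with the last close."""
    buckets = defaultdict(list)
    for ts, price in trades:
        buckets[(ts // interval) * interval].append(price)
    start = min(buckets)
    last = trades[0][1]
    out = []
    for ts in range(start, (end_ts // interval) * interval + 1, interval):
        prices = buckets.get(ts)
        if prices:
            out.append((ts, prices[0], prices[-1]))
            last = prices[-1]
        else:
            out.append((ts, last, last))  # gap: carry the last close forward
    return out

print(ohlc_forward_fill([(0, 1.0), (5, 1.5), (20, 2.0)], 30, 10))
# [(0, 1.0, 1.5), (10, 1.5, 1.5), (20, 2.0, 2.0), (30, 2.0, 2.0)]
```

Extending the grid to the cutoff (here `end_ts`) means the series keeps emitting flat candles after the last trade, which matches the method's behavior of padding quiet tokens up to `T_cutoff`.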
+
+    def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
+        """
+        Loads a pre-processed data item from the cache, or generates it on-the-fly
+        if the dataset is in online mode.
+        """
+        if self.cache_dir:
+            if idx >= len(self.cached_files):
+                raise IndexError(f"Index {idx} out of range for {len(self.cached_files)} cached files.")
+            filepath = self.cached_files[idx]
+            try:
+                # Use map_location to avoid issues if cached on GPU and loading on CPU
+                return torch.load(filepath, map_location='cpu')
+            except Exception as e:
+                print(f"ERROR: Could not load or process cached item {filepath}: {e}")
+                return None  # DataLoader can be configured to skip None items
+
+        # Fall back to online generation if no cache_dir is set
+        return self.__cacheitem__(idx)
+
+    def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
+        """
+        The main data loading method. For a given token, it fetches all
+        relevant on-chain and off-chain data, processes it, and returns
+        a structured dictionary for the collator.
+        """
+        if not self.sampled_mints:
+            raise RuntimeError("Dataset has no mint records loaded; ensure fetcher returned data during initialization.")
+        if idx >= len(self.sampled_mints):
+            raise IndexError(f"Requested sample index {idx} exceeds loaded mint count {len(self.sampled_mints)}.")
+        initial_mint_record = self.sampled_mints[idx]
+        t0 = initial_mint_record["timestamp"]
+        creator_address = initial_mint_record['creator_address']
+        token_address = initial_mint_record['mint_address']
+        print(f"\n--- Building dataset for token: {token_address} ---")
+
+        # The EmbeddingPooler is crucial for collecting unique text/images per sample
+        pooler = EmbeddingPooler()
+
+        def _safe_int(value: Any) -> int:
+            try:
+                return int(value)
+            except (TypeError, ValueError):
+                return 0
+
+        def _timestamp_to_order_value(ts_value: Any) -> float:
+            if isinstance(ts_value, datetime.datetime):
+                if ts_value.tzinfo is None:
+                    ts_value = ts_value.replace(tzinfo=datetime.timezone.utc)
+                return ts_value.timestamp()
+            try:
+                return float(ts_value)
+            except (TypeError, ValueError):
+                return 0.0
+
+        def _event_execution_sort_key(timestamp_value: Any,
+                                      slot: Any = 0,
+                                      transaction_index: Any = 0,
+                                      instruction_index: Any = 0,
+                                      signature: str = '') -> tuple:
+            return (
+                _timestamp_to_order_value(timestamp_value),
+                _safe_int(slot),
+                _safe_int(transaction_index),
+                _safe_int(instruction_index),
+                signature or ''
+            )
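The sort keys above exploit Python's lexicographic tuple comparison: the timestamp decides first, with slot, transaction index, instruction index, and signature as successive tie-breakers. A small illustration (the helper name is ours):

```python
def execution_sort_key(timestamp, slot=0, tx_index=0, ix_index=0, signature=''):
    """Deterministic ordering key: tuples compare element-by-element, left to right."""
    return (float(timestamp), int(slot), int(tx_index), int(ix_index), signature)

events = [
    execution_sort_key(100, slot=2, signature='b'),
    execution_sort_key(100, slot=1, signature='a'),
    execution_sort_key(99, slot=9, signature='c'),
]
print(sorted(events)[0])  # (99.0, 9, 0, 0, 'c'): the earliest timestamp sorts first
```

Including the signature as the final component makes the ordering fully deterministic even when two events share the same slot and indices, which keeps cached samples reproducible across runs.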
+
+        # 1. Fetch anchor Mint event to establish the timeline & initial entities
+        # --- SIMPLIFIED: Use the mint record we already have ---
+        mint_event = {
+            'event_type': 'Mint',
+            'timestamp': int(initial_mint_record['timestamp'].timestamp()),
+            'relative_ts': 0,
+            'wallet_address': initial_mint_record['creator_address'],
+            'token_address': token_address,
+            'protocol_id': initial_mint_record.get('protocol')
+        }
+
+        initial_entities = {mint_event['wallet_address']}
+        event_sequence_entries: List[Tuple[tuple, Dict[str, Any]]] = []
+
+        def _register_event(event: Dict[str, Any], sort_key: tuple):
+            event_sequence_entries.append((sort_key, event))
+
+        _register_event(mint_event, _event_execution_sort_key(mint_event['timestamp'], signature='Mint'))
+
+        # Determine the cutoff time for all historical data fetching.
+        # T_cutoff = datetime.datetime.fromtimestamp(event_sequence[-1]['timestamp'], tz=datetime.timezone.utc)
+        # --- MODIFIED: Set T_cutoff to mint timestamp + self.t_cutoff_seconds ---
+        T_cutoff = initial_mint_record['timestamp'] + datetime.timedelta(seconds=self.t_cutoff_seconds)
+        max_horizon_seconds = max(self.horizons_seconds) if self.horizons_seconds else 0
+        future_trades_for_labels: List[Dict[str, Any]] = []
+        if self.num_outputs > 0 and max_horizon_seconds > 0:
+            future_window_end = T_cutoff + datetime.timedelta(seconds=max_horizon_seconds)
+            future_trades_for_labels = self.fetcher.fetch_future_trades_for_token(
+                token_address, T_cutoff, future_window_end
+            )
+            if not future_trades_for_labels:
+                print(f"INFO: Skipping token {token_address} (no future trades beyond cutoff).")
+                return None
+
+        # --- NEW: Accumulate all wallets before hitting Neo4j to avoid duplicate queries ---
+        graph_seed_entities = set(initial_entities)
+        all_graph_entities: Dict[str, str] = {mint_event['wallet_address']: 'Wallet'}
+        all_graph_entity_addrs = set(all_graph_entities.keys())
+        graph_links: Dict[str, Any] = {}
+
+        # 3. Fetch trades and add traders to the entity set
+        # --- REFACTORED: Fetch trades using the new 3-part HBH system ---
+        early_trades, middle_trades, recent_trades = self.fetcher.fetch_trades_for_token(
+            token_address, T_cutoff, EVENT_COUNT_THRESHOLD_FOR_HBH, HBH_EARLY_EVENT_LIMIT, HBH_RECENT_EVENT_LIMIT
+        )
+
+        def _trade_execution_sort_key(trade: Dict[str, Any]) -> tuple:
+            return (
+                _timestamp_to_order_value(trade.get('timestamp')),
+                _safe_int(trade.get('slot')),
+                _safe_int(trade.get('transaction_index')),
+                _safe_int(trade.get('instruction_index')),
+                trade.get('signature', '')
+            )
+
+        early_trades = sorted(early_trades, key=_trade_execution_sort_key)
+        middle_trades = sorted(middle_trades, key=_trade_execution_sort_key)
+        recent_trades = sorted(recent_trades, key=_trade_execution_sort_key)
+
+        # --- NEW: Inject special context tokens to mark HBH boundaries ---
+        # 'Middle' marks the start of the blurry middle window
+        if middle_trades:
+            mid_ts_val = _timestamp_to_order_value(middle_trades[0].get('timestamp'))
+            middle_event = {
+                'event_type': 'Middle',
+                'timestamp': int(mid_ts_val),
+                'relative_ts': mid_ts_val - _timestamp_to_order_value(t0)
+            }
+            _register_event(middle_event, _event_execution_sort_key(mid_ts_val, signature='Middle'))
+
+        # 'RECENT' marks the start of the high-definition recent window
+        if recent_trades:
+            rec_ts_val = _timestamp_to_order_value(recent_trades[0].get('timestamp'))
+            recent_event = {
+                'event_type': 'RECENT',
+                'timestamp': int(rec_ts_val),
+                'relative_ts': rec_ts_val - _timestamp_to_order_value(t0)
+            }
+            _register_event(recent_event, _event_execution_sort_key(rec_ts_val, signature='RECENT'))
+
+        # For now, we only process the high-definition segments for event creation,
+        # deduplicated in case of overlap between the early/recent slices.
+        trade_records = []
+        seen_trade_keys = set()
+        for trade in early_trades + recent_trades:
+            dedupe_key = (
+                _safe_int(trade.get('slot')),
+                _safe_int(trade.get('transaction_index')),
+                _safe_int(trade.get('instruction_index')),
+                trade.get('signature', '')
+            )
+            if dedupe_key in seen_trade_keys:
+                continue
+            seen_trade_keys.add(dedupe_key)
+            trade_records.append(trade)
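The dedupe loop above keeps the first occurrence per `(slot, transaction_index, instruction_index, signature)` key while preserving execution order. The same idea in isolation (the helper name is ours):

```python
def dedupe_by_key(records, key_fn):
    """Keep the first occurrence per key, preserving input order."""
    seen = set()
    out = []
    for rec in records:
        k = key_fn(rec)
        if k in seen:
            continue
        seen.add(k)
        out.append(rec)
    return out

trades = [{'sig': 'a', 'slot': 1}, {'sig': 'a', 'slot': 1}, {'sig': 'b', 'slot': 1}]
print(dedupe_by_key(trades, lambda t: (t['slot'], t['sig'])))
# [{'sig': 'a', 'slot': 1}, {'sig': 'b', 'slot': 1}]
```

A set-based membership test keeps this O(n) overall, which matters when the early and recent HBH slices overlap heavily for short-lived tokens.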
905
+
906
+ for trade in trade_records:
907
+ trader_addr = trade['maker']
908
+ if trader_addr not in all_graph_entity_addrs:
909
+ all_graph_entity_addrs.add(trader_addr)
910
+ all_graph_entities[trader_addr] = 'Wallet' # Trades are always made by wallets
911
+ graph_seed_entities.add(trader_addr)
912
+
913
+ # --- REFACTORED: Fetch significant transfers, passing total supply for filtering ---
914
+ raw_total_supply = initial_mint_record.get('total_supply', 0)
915
+ base_decimals = initial_mint_record.get('token_decimals', 9)
916
+ total_supply_dec = (raw_total_supply / (10**base_decimals)) if base_decimals > 0 else raw_total_supply
917
+
918
+ # Calculate the minimum amount to be considered a significant transfer
919
+ total_supply_dec = total_supply_dec * MIN_AMOUNT_TRANSFER_SUPPLY # 0.01% of total supply
920
+
921
+ transfer_records = self.fetcher.fetch_transfers_for_token(token_address, T_cutoff, total_supply_dec)
922
+ for transfer in transfer_records:
923
+ src = transfer.get('source')
924
+ dst = transfer.get('destination')
925
+ if src:
926
+ all_graph_entities[src] = 'Wallet'
927
+ graph_seed_entities.add(src)
928
+ if dst:
929
+ all_graph_entities[dst] = 'Wallet'
930
+ graph_seed_entities.add(dst)
931
+
+ # --- NEW: Fetch pool creation events to enrich entity set and token list ---
+ pool_creation_records = self.fetcher.fetch_pool_creations_for_token(token_address, T_cutoff)
+ pool_quote_addresses = set()
+ pool_metadata_by_address: Dict[str, Dict[str, Any]] = {}
+ for pool_record in pool_creation_records:
+ creator_addr = pool_record.get('creator_address')
+ if creator_addr:
+ all_graph_entities[creator_addr] = 'Wallet'
+ graph_seed_entities.add(creator_addr)
+ quote_addr = pool_record.get('quote_address')
+ if quote_addr:
+ pool_quote_addresses.add(quote_addr)
+ # Mark discovered quote tokens so they can be fetched later if needed
+ all_graph_entities.setdefault(quote_addr, 'Token')
+ pool_addr = pool_record.get('pool_address')
+ if pool_addr:
+ pool_metadata_by_address[pool_addr] = {
+ 'quote_token_address': quote_addr,
+ 'quote_decimals': pool_record.get('quote_decimals'),
+ 'base_decimals': pool_record.get('base_decimals')
+ }
+
+ liquidity_change_records = self.fetcher.fetch_liquidity_changes_for_pools(list(pool_metadata_by_address.keys()), T_cutoff)
+ for liquidity_record in liquidity_change_records:
+ lp_provider = liquidity_record.get('lp_provider')
+ if lp_provider:
+ all_graph_entities[lp_provider] = 'Wallet'
+ graph_seed_entities.add(lp_provider)
+
+ fee_collection_records = self.fetcher.fetch_fee_collections_for_token(token_address, T_cutoff)
+ burn_records = self.fetcher.fetch_burns_for_token(token_address, T_cutoff)
+ supply_lock_records = self.fetcher.fetch_supply_locks_for_token(token_address, T_cutoff)
+ migration_records = self.fetcher.fetch_migrations_for_token(token_address, T_cutoff)
+ # NEW: Fetch top holders to include their wallets so we can embed them
+ holder_records = self.fetcher.fetch_token_holders_for_snapshot(token_address, T_cutoff, limit=HOLDER_SNAPSHOT_TOP_K)
+ fee_related_mints = set()
+ for fee_record in fee_collection_records:
+ recipient = fee_record.get('recipient_address')
+ if recipient:
+ all_graph_entities[recipient] = 'Wallet'
+ graph_seed_entities.add(recipient)
+ mint_addr = fee_record.get('token_0_mint_address')
+ if mint_addr and mint_addr not in (token_address, ''):
+ fee_related_mints.add(mint_addr)
+ # Include migration pool addresses as tokens/entities if present
+ for mig in migration_records:
+ vpool = mig.get('virtual_pool_address')
+ paddr = mig.get('pool_address')
+ if vpool:
+ all_graph_entities.setdefault(vpool, 'Token')
+ if paddr:
+ all_graph_entities.setdefault(paddr, 'Token')
+
+ # Include burner wallets in entity set
+ for burn in burn_records:
+ src = burn.get('source')
+ if src:
+ all_graph_entities[src] = 'Wallet'
+ graph_seed_entities.add(src)
+ # Include holder wallets in entity set for embedding availability
+ for rec in holder_records:
+ wa = rec.get('wallet_address')
+ if wa:
+ all_graph_entities[wa] = 'Wallet'
+ graph_seed_entities.add(wa)
+ # Include lockers in entity set
+ for lock in supply_lock_records:
+ sender = lock.get('sender')
+ recipient = lock.get('recipient')
+ if sender:
+ all_graph_entities[sender] = 'Wallet'
+ graph_seed_entities.add(sender)
+ if recipient:
+ all_graph_entities[recipient] = 'Wallet'
+ graph_seed_entities.add(recipient)
+
+ # --- NEW: Now that all wallets are known, fetch graph links once ---
+ graph_links = [] # Default so 'graph_links' is always defined for the returned item
+ if graph_seed_entities:
+ fetched_graph_entities, graph_links = self.fetcher.fetch_graph_links(
+ list(graph_seed_entities),
+ T_cutoff=T_cutoff,
+ max_degrees=2
+ )
+ for addr, entity_type in fetched_graph_entities.items():
+ all_graph_entities[addr] = entity_type
+ all_graph_entity_addrs = set(all_graph_entities.keys())
+
+ # 4. Fetch and process static data for the main token
+ tokens_to_fetch = [token_address]
+ for quote_addr in pool_quote_addresses:
+ if quote_addr and quote_addr not in tokens_to_fetch:
+ tokens_to_fetch.append(quote_addr)
+ for mint_addr in fee_related_mints:
+ if mint_addr and mint_addr not in tokens_to_fetch:
+ tokens_to_fetch.append(mint_addr)
+ main_metadata = {}
+ main_metadata[token_address] = {
+ 'name': initial_mint_record["token_name"],
+ 'symbol': initial_mint_record["token_symbol"],
+ 'token_uri': initial_mint_record["token_uri"],
+ 'protocol': initial_mint_record["protocol"],
+ 'total_supply': initial_mint_record["total_supply"],
+ 'decimals': initial_mint_record["token_decimals"],
+ 'address': token_address
+ }
+
+ main_token_data = self._process_token_data(tokens_to_fetch, pooler, T_cutoff, main_metadata)
+
+ # --- CRITICAL FIX: If the main token is invalid, skip this entire sample ---
+ if not main_token_data:
+ return None # The specific reason is already logged in _process_token_data
+
+ # 5. Fetch and process data for ALL wallets discovered (from mint, graph, trades, etc.)
+ # --- FIXED: Correctly identify wallets using their entity type from the graph ---
+ wallets_to_fetch = [addr for addr, entity_type in all_graph_entities.items() if entity_type == 'Wallet']
+ # Also include traders from trades, even if they weren't in the graph
+ wallets_to_fetch.extend([trade['maker'] for trade in trade_records if trade['maker'] not in wallets_to_fetch])
+ wallet_data, all_token_data = self._process_wallet_data(list(set(wallets_to_fetch)), main_token_data.copy(), pooler, T_cutoff)
+
+ # 6. Process trades into event format using the now-available wallet_data
+ trade_events = []
+
+ aggregation_trades = []
+ high_def_chart_trades = [] # Early + recent windows use 1s candles
+ middle_chart_trades = [] # Middle window uses 30s candles
+ # --- FIXED: Get main token decimals once before the loop ---
+ main_token_info = main_token_data[token_address]
+ base_decimals = main_token_info.get('decimals', 6)
+ # --- FIXED: Get total_supply directly from the initial mint record ---
+ raw_total_supply = initial_mint_record.get('total_supply', 0)
+ total_supply_dec = (raw_total_supply / (10**base_decimals)) if base_decimals > 0 else raw_total_supply
+
+ t0_timestamp = _timestamp_to_order_value(t0)
+
+ for trade in trade_records:
+ # --- NEW: Filter out trades with low USD value ---
+ # This applies to both event creation and chart aggregation.
+ if trade.get('total_usd', 0.0) < self.min_trade_usd:
+ continue
+
+ trade_sort_key = _trade_execution_sort_key(trade)
+ trade_timestamp = trade.get('timestamp')
+ trade_timestamp_value = _timestamp_to_order_value(trade_timestamp)
+ trade_timestamp_int = int(trade_timestamp_value)
+ # --- NEW: Determine event type with priority ---
+ trader_addr = trade['maker']
+ trader_wallet_data = wallet_data.get(trader_addr, {})
+ trader_profile = trader_wallet_data.get('profile', {})
+ trader_socials = trader_wallet_data.get('socials', {})
+
+ KOL_NAME_KEYS = ['kolscan_name', 'cabalspy_name', 'axiom_kol_name']
+ is_kol = any(trader_socials.get(key) for key in KOL_NAME_KEYS if trader_socials)
+ is_profitable = (trader_profile.get('stats_30d_realized_profit_pnl', 0.0) > SMART_WALLET_PNL_THRESHOLD and
+ trader_profile.get('stats_30d_realized_profit_usd', 0.0) > SMART_WALLET_USD_THRESHOLD)
+
+ base_amount_dec = trade.get('base_amount', 0) / (10**base_decimals)
+ is_large_amount = (total_supply_dec > 0 and (base_amount_dec / total_supply_dec) > LARGE_TRADE_SUPPLY_PCT_THRESHOLD)
+
+ if trader_addr == creator_address:
+ event_type = 'Deployer_Trade'
+ elif is_kol or is_profitable:
+ event_type = 'SmartWallet_Trade'
+ elif trade.get('total_usd', 0.0) > LARGE_TRADE_USD_THRESHOLD or is_large_amount:
+ event_type = 'LargeTrade'
+ else:
+ event_type = 'Trade'
+
+ # --- NEW: Get token decimals for accurate calculations ---
+ quote_address = trade.get('quote_address')
+ quote_decimals = QUOTE_TOKEN_DECIMALS.get(quote_address, 9) # Default to 9 for SOL
+
+ quote_amount_dec = trade.get('quote_amount', 0) / (10**quote_decimals)
+
+ # --- NEW: Correctly calculate pre-trade balances ---
+ is_sell = trade.get('trade_type') == 1
+
+ # If it's a sell, the pre-trade base balance was higher.
+ pre_trade_base_balance = (trade.get('base_balance', 0.0) + base_amount_dec) if is_sell else trade.get('base_balance', 0.0)
+ # If it's a buy, the pre-trade quote balance was higher.
+ pre_trade_quote_balance = (trade.get('quote_balance', 0.0) + quote_amount_dec) if not is_sell else trade.get('quote_balance', 0.0)
+
+ # --- NEW: Calculate percentage features with the corrected values ---
+ token_amount_pct = (base_amount_dec / pre_trade_base_balance) if pre_trade_base_balance > 1e-9 else 1.0
+ quote_amount_pct = (quote_amount_dec / pre_trade_quote_balance) if pre_trade_quote_balance > 1e-9 else 1.0
+ is_success = trade.get('success', False)
+ if is_success:
+ chart_entry = {
+ 'trade_direction': 1 if is_sell else 0, # 1 for sell, 0 for buy
+ 'price_usd': trade.get('price_usd', 0.0),
+ 'timestamp': trade_timestamp_int,
+ 'sort_key': trade_sort_key,
+ }
+ aggregation_trades.append(chart_entry)
+ high_def_chart_trades.append(chart_entry.copy())
+ # --- NEW: Calculate token amount as a percentage of total supply ---
+ token_amount_pct_of_supply = (base_amount_dec / total_supply_dec) if total_supply_dec > 0 else 0.0
+ trade_event = {
+ 'event_type': event_type,
+ 'timestamp': trade_timestamp_int,
+ 'relative_ts': trade_timestamp_value - t0_timestamp,
+ 'wallet_address': trade['maker'],
+ 'token_address': token_address,
+ 'trade_direction': 1 if is_sell else 0, # 1 for sell, 0 for buy
+ 'sol_amount': trade.get('total', 0.0), # Assuming 'total' is the SOL amount
+ 'dex_platform_id': trade.get('platform', 0),
+ 'priority_fee': trade.get('priority_fee', 0.0),
+ 'mev_protection': 1 if trade.get('mev_protection', 0) > 0 else 0, # Convert to binary: 0 for False, 1 for True
+ # --- FIXED: Use the new, correct percentage calculations ---
+ 'token_amount_pct_of_holding': token_amount_pct,
+ 'quote_amount_pct_of_holding': quote_amount_pct,
+ 'slippage': trade.get('slippage', 0.0),
+ 'token_amount_pct_to_total_supply': token_amount_pct_of_supply, # FIXED: Replaced price_impact
+ 'success': is_success,
+ 'is_bundle': False, # Default to False, will be updated below
+ 'total_usd': trade.get('total_usd', 0.0)
+ }
+ trade_events.append(trade_event)
+ _register_event(trade_event, trade_sort_key)
+
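The pre-trade balance logic above rolls post-trade balances back by the traded amount to recover what the wallet held before the fill, then uses that as the denominator for the percentage features. An isolated sketch (field names mirror the trade records; the helper itself is illustrative):

```python
# Sketch of pre-trade balance reconstruction: a sell reduced the base balance,
# a buy reduced the quote balance, so add the traded amount back before
# computing "fraction of holding moved". A near-zero pre-trade balance is
# treated as the whole holding moving (pct = 1.0).
def holding_pcts(trade, base_amount_dec, quote_amount_dec):
    is_sell = trade.get('trade_type') == 1
    pre_base = trade.get('base_balance', 0.0) + base_amount_dec if is_sell else trade.get('base_balance', 0.0)
    pre_quote = trade.get('quote_balance', 0.0) + quote_amount_dec if not is_sell else trade.get('quote_balance', 0.0)
    base_pct = base_amount_dec / pre_base if pre_base > 1e-9 else 1.0
    quote_pct = quote_amount_dec / pre_quote if pre_quote > 1e-9 else 1.0
    return base_pct, quote_pct
```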
+ for trade in middle_trades:
+ # --- NEW: Filter out trades with low USD value from chart aggregation ---
+ if trade.get('total_usd', 0.0) < self.min_trade_usd:
+ continue
+
+ is_sell = trade.get('trade_type') == 1
+
+ chart_entry = {
+ 'trade_direction': 1 if is_sell else 0, # 1 for sell, 0 for buy
+ 'price_usd': trade.get('price_usd', 0.0),
+ 'timestamp': int(_timestamp_to_order_value(trade.get('timestamp'))),
+ 'sort_key': _trade_execution_sort_key(trade),
+ }
+ aggregation_trades.append(chart_entry)
+ middle_chart_trades.append(chart_entry.copy())
+
+ def _finalize_chart_trade_list(trade_list: List[Dict[str, Any]]):
+ trade_list.sort(key=lambda x: x['sort_key'])
+ for entry in trade_list:
+ entry.pop('sort_key', None)
+
+ _finalize_chart_trade_list(aggregation_trades)
+ _finalize_chart_trade_list(high_def_chart_trades)
+ _finalize_chart_trade_list(middle_chart_trades)
+
+ # --- NEW: Debugging log for all trades used in chart generation ---
+ print(f"\n[DEBUG] Total aggregated trades for OHLC: {len(aggregation_trades)}")
+ if aggregation_trades:
+ print("[DEBUG] First 5 aggregated trades:", aggregation_trades[:5])
+
+ HIGH_DEF_INTERVAL = ("1s", 1)
+ MIDDLE_INTERVAL = ("30s", 30)
+
+ def _emit_chart_segments(trades: List[Dict[str, Any]], interval: tuple, signature_prefix: str):
+ if not trades:
+ return []
+ interval_label, interval_seconds = interval
+ ohlc_series = self._generate_ohlc(trades, T_cutoff, interval_seconds)
+ print(f"[DEBUG] Generated OHLC series ({interval_label}) with {len(ohlc_series)} candles. First 5: {ohlc_series[:5]}")
+ emitted_events = []
+ for idx in range(0, len(ohlc_series), OHLC_SEQ_LEN):
+ segment = ohlc_series[idx:idx + OHLC_SEQ_LEN]
+ if not segment:
+ continue
+ last_ts = segment[-1][0]
+ opens_raw = [s[1] for s in segment]
+ closes_raw = [s[2] for s in segment]
+ chart_event = {
+ 'event_type': 'Chart_Segment',
+ 'timestamp': last_ts,
+ 'relative_ts': last_ts - t0_timestamp,
+ 'opens': self._normalize_price_series(opens_raw),
+ 'closes': self._normalize_price_series(closes_raw),
+ 'i': interval_label
+ }
+ emitted_events.append(chart_event)
+ _register_event(chart_event, _event_execution_sort_key(last_ts, signature=f"{signature_prefix}-{idx}"))
+ return emitted_events
+
+ # --- NEW: Generate Chart_Segment events from aggregated trades ---
+ chart_events = []
+ chart_events.extend(_emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, "chart-hd"))
+ chart_events.extend(_emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL, "chart-mid"))
+
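`_emit_chart_segments` relies on `self._generate_ohlc` (defined elsewhere in the file) to bucket trades into fixed-interval candles as `(timestamp, open, close)` tuples. A minimal stand-in showing that bucketing idea under those assumptions — this is not the project's implementation:

```python
# Illustrative candle bucketing: floor each trade's timestamp to its interval
# bucket; the first trade in a bucket sets the open, the last one the close.
def simple_ohlc(trades, interval_seconds):
    candles = {}  # bucket_start_ts -> [open, close]
    for t in sorted(trades, key=lambda x: x['timestamp']):
        bucket = (t['timestamp'] // interval_seconds) * interval_seconds
        if bucket not in candles:
            candles[bucket] = [t['price_usd'], t['price_usd']]
        else:
            candles[bucket][1] = t['price_usd']  # later trade updates the close
    return [(ts, oc[0], oc[1]) for ts, oc in sorted(candles.items())]
```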
+ # --- NEW: Convert pool creation records into structured events ---
+ SOL_MINT_ADDRESS = 'So11111111111111111111111111111111111111112'
+
+ def _convert_amount_with_decimals(raw_amount: Any, mint_addr: Optional[str]) -> float:
+ if raw_amount is None:
+ return 0.0
+ try:
+ amount_float = float(raw_amount)
+ except (TypeError, ValueError):
+ return 0.0
+ decimals_value = None
+ if mint_addr == SOL_MINT_ADDRESS:
+ decimals_value = QUOTE_TOKEN_DECIMALS.get(SOL_MINT_ADDRESS, 9)
+ elif mint_addr:
+ token_info = all_token_data.get(mint_addr) or main_token_data.get(mint_addr)
+ if token_info:
+ decimals_value = token_info.get('decimals')
+ if decimals_value is None:
+ return amount_float
+ try:
+ decimals_int = max(int(decimals_value), 0)
+ except (TypeError, ValueError):
+ decimals_int = 0
+ if decimals_int <= 0:
+ return amount_float
+ if mint_addr == SOL_MINT_ADDRESS:
+ should_scale = abs(amount_float) >= 1e5
+ else:
+ should_scale = abs(amount_float) >= (10 ** decimals_int)
+ return amount_float / (10 ** decimals_int) if should_scale else amount_float
+
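The helper above guesses whether an amount is still raw (an integer in base units) or already decimal-scaled by comparing its magnitude against `10**decimals`, and only divides in the first case. The core heuristic, stripped of the SOL special case and decimal lookups (a sketch, not the exact function):

```python
# Heuristic: an amount at least as large as one whole raw unit (10**decimals)
# is assumed to be un-scaled and is divided down; smaller values are assumed
# to be decimal-scaled already and pass through unchanged.
def scale_if_raw(amount, decimals):
    threshold = 10 ** decimals
    return amount / threshold if abs(amount) >= threshold else amount
```

The trade-off is that genuinely huge decimal amounts would be mis-scaled, which is why the real helper also checks per-mint decimals and uses a lower cutoff for SOL.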
+ pool_created_events = []
+ for pool_record in pool_creation_records:
+ pool_ts_value = _timestamp_to_order_value(pool_record.get('timestamp'))
+ pool_timestamp_int = int(pool_ts_value)
+
+ quote_token_address = pool_record.get('quote_address')
+
+ base_liquidity_raw = pool_record.get('initial_base_liquidity')
+ base_decimals_override = pool_record.get('base_decimals')
+ if base_decimals_override is None:
+ base_decimals_override = main_token_info.get('decimals', base_decimals)
+ base_decimals_value = int(base_decimals_override) if base_decimals_override is not None else int(base_decimals)
+ base_amount_dec = _convert_amount_with_decimals(base_liquidity_raw, token_address)
+
+ quote_liquidity_raw = pool_record.get('initial_quote_liquidity')
+ quote_decimals_override = pool_record.get('quote_decimals')
+ if quote_decimals_override is None:
+ quote_token_info = main_token_data.get(quote_token_address, {})
+ quote_decimals_override = quote_token_info.get('decimals', QUOTE_TOKEN_DECIMALS.get(quote_token_address, 9))
+ if quote_decimals_override is None:
+ quote_decimals_override = 9
+ quote_decimals_value = int(quote_decimals_override)
+ quote_amount_dec = _convert_amount_with_decimals(quote_liquidity_raw, quote_token_address)
+
+ protocol_raw = pool_record.get('protocol')
+ protocol_id = protocol_raw if isinstance(protocol_raw, int) and 0 <= protocol_raw < vocab.NUM_PROTOCOLS else vocab.PROTOCOL_TO_ID.get('Unknown', 0)
+
+ pool_event = {
+ 'event_type': 'PoolCreated',
+ 'timestamp': pool_timestamp_int,
+ 'relative_ts': pool_ts_value - t0_timestamp,
+ 'wallet_address': pool_record.get('creator_address'),
+ 'token_address': token_address,
+ 'protocol_id': protocol_id,
+ 'quote_token_address': quote_token_address,
+ 'base_amount': base_amount_dec,
+ 'quote_amount': quote_amount_dec,
+ 'priority_fee': pool_record.get('priority_fee', 0.0),
+ }
+ pool_created_events.append(pool_event)
+ pool_sort_key = _event_execution_sort_key(
+ pool_ts_value,
+ slot=pool_record.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=pool_record.get('signature', '')
+ )
+ _register_event(pool_event, pool_sort_key)
+
+ # --- NEW: Convert liquidity change records into structured events ---
+ liquidity_change_events = []
+ for liquidity_record in liquidity_change_records:
+ pool_address = liquidity_record.get('pool_address')
+ pool_meta = pool_metadata_by_address.get(pool_address, {})
+ quote_token_address = pool_meta.get('quote_token_address')
+
+ quote_decimals_override = pool_meta.get('quote_decimals')
+ if quote_decimals_override is None:
+ quote_token_info = main_token_data.get(quote_token_address, {})
+ quote_decimals_override = quote_token_info.get('decimals', QUOTE_TOKEN_DECIMALS.get(quote_token_address, 9))
+ if quote_decimals_override is None:
+ quote_decimals_override = 9
+
+ quote_amount_raw = liquidity_record.get('quote_amount', 0)
+ quote_decimals_value = int(quote_decimals_override)
+ quote_amount_dec = _convert_amount_with_decimals(quote_amount_raw, quote_token_address)
+
+ liquidity_ts_value = _timestamp_to_order_value(liquidity_record.get('timestamp'))
+ liquidity_timestamp_int = int(liquidity_ts_value)
+
+ protocol_raw = liquidity_record.get('protocol')
+ protocol_id = protocol_raw if isinstance(protocol_raw, int) and 0 <= protocol_raw < vocab.NUM_PROTOCOLS else vocab.PROTOCOL_TO_ID.get('Unknown', 0)
+ change_type_id = int(liquidity_record.get('change_type', 0) or 0)
+
+ liquidity_event = {
+ 'event_type': 'LiquidityChange',
+ 'timestamp': liquidity_timestamp_int,
+ 'relative_ts': liquidity_ts_value - t0_timestamp,
+ 'wallet_address': liquidity_record.get('lp_provider'),
+ 'token_address': token_address,
+ 'protocol_id': protocol_id,
+ 'quote_token_address': quote_token_address,
+ 'change_type_id': change_type_id,
+ 'quote_amount': quote_amount_dec,
+ 'priority_fee': liquidity_record.get('priority_fee', 0.0),
+ 'success': liquidity_record.get('success', False)
+ }
+
+ if quote_token_address:
+ liquidity_change_events.append(liquidity_event)
+ liquidity_sort_key = _event_execution_sort_key(
+ liquidity_ts_value,
+ slot=liquidity_record.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=liquidity_record.get('signature', '')
+ )
+ _register_event(liquidity_event, liquidity_sort_key)
+
+ # --- NEW: Convert fee collection records into structured events ---
+ fee_collected_events = []
+ for fee_record in fee_collection_records:
+ fee_ts_value = _timestamp_to_order_value(fee_record.get('timestamp'))
+ fee_timestamp_int = int(fee_ts_value)
+
+ token0_mint = fee_record.get('token_0_mint_address')
+ token1_mint = fee_record.get('token_1_mint_address')
+ token0_amount_raw = fee_record.get('token_0_amount')
+ token1_amount_raw = fee_record.get('token_1_amount')
+
+ sol_amount = 0.0
+ if token0_mint == SOL_MINT_ADDRESS:
+ sol_amount = _convert_amount_with_decimals(token0_amount_raw, SOL_MINT_ADDRESS)
+ elif token1_mint == SOL_MINT_ADDRESS:
+ sol_amount = _convert_amount_with_decimals(token1_amount_raw, SOL_MINT_ADDRESS)
+
+ # Skip records that have no recipient wallet to attribute the fee to
+ recipient_addr = fee_record.get('recipient_address')
+ if not recipient_addr:
+ continue
+
+ fee_event = {
+ 'event_type': 'FeeCollected',
+ 'timestamp': fee_timestamp_int,
+ 'relative_ts': fee_ts_value - t0_timestamp,
+ 'wallet_address': recipient_addr,
+ 'token_address': token_address,
+ 'sol_amount': sol_amount,
+ 'priority_fee': fee_record.get('priority_fee', 0.0),
+ 'protocol_id': fee_record.get('protocol', 0),
+ 'success': fee_record.get('success', False),
+ }
+
+ fee_collected_events.append(fee_event)
+ fee_sort_key = _event_execution_sort_key(
+ fee_ts_value,
+ slot=fee_record.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=fee_record.get('signature', '')
+ )
+ _register_event(fee_event, fee_sort_key)
+
+ # --- NEW: Convert burn records into structured TokenBurn events ---
+ token_burn_events = []
+ for burn in burn_records:
+ burn_ts_value = _timestamp_to_order_value(burn.get('timestamp'))
+ burn_timestamp_int = int(burn_ts_value)
+
+ amount_dec = burn.get('amount_decimal')
+ if amount_dec is None:
+ raw_amount = burn.get('amount', 0)
+ try:
+ raw_amount = float(raw_amount)
+ except (TypeError, ValueError):
+ raw_amount = 0.0
+ amount_dec = raw_amount / (10**base_decimals) if base_decimals and base_decimals > 0 else raw_amount
+
+ pct_of_supply = (amount_dec / total_supply_dec) if total_supply_dec and total_supply_dec > 0 else 0.0
+
+ burn_event = {
+ 'event_type': 'TokenBurn',
+ 'timestamp': burn_timestamp_int,
+ 'relative_ts': burn_ts_value - t0_timestamp,
+ 'wallet_address': burn.get('source'),
+ 'token_address': token_address,
+ 'amount_pct_of_total_supply': pct_of_supply,
+ 'amount_tokens_burned': amount_dec,
+ 'priority_fee': burn.get('priority_fee', 0.0),
+ 'success': burn.get('success', False),
+ }
+ token_burn_events.append(burn_event)
+ burn_sort_key = _event_execution_sort_key(
+ burn_ts_value,
+ slot=burn.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=burn.get('signature', '')
+ )
+ _register_event(burn_event, burn_sort_key)
+
+ # --- NEW: Convert migrations into Migrated events ---
+ for mig in migration_records:
+ mig_ts_value = _timestamp_to_order_value(mig.get('timestamp'))
+ mig_timestamp_int = int(mig_ts_value)
+ prot_raw = mig.get('protocol', 0)
+ protocol_id = prot_raw if isinstance(prot_raw, int) and 0 <= prot_raw < vocab.NUM_PROTOCOLS else vocab.PROTOCOL_TO_ID.get('Unknown', 0)
+ mig_event = {
+ 'event_type': 'Migrated',
+ 'timestamp': mig_timestamp_int,
+ 'relative_ts': mig_ts_value - t0_timestamp,
+ 'protocol_id': protocol_id,
+ }
+ mig_sort_key = _event_execution_sort_key(
+ mig_ts_value,
+ slot=mig.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=mig.get('signature', '')
+ )
+ _register_event(mig_event, mig_sort_key)
+
+ # NOTE: HolderSnapshot events are generated per-snapshot time inside _generate_onchain_snapshots
+
+ # --- NEW: Convert supply lock records into structured SupplyLock events ---
+ supply_lock_events = []
+ for lock in supply_lock_records:
+ lock_ts_value = _timestamp_to_order_value(lock.get('timestamp'))
+ lock_timestamp_int = int(lock_ts_value)
+
+ # total_locked_amount is Float64, typically already decimal-scaled
+ raw_locked = lock.get('total_locked_amount', 0.0)
+ try:
+ locked_amount = float(raw_locked)
+ except (TypeError, ValueError):
+ locked_amount = 0.0
+
+ pct_of_supply = (locked_amount / total_supply_dec) if total_supply_dec and total_supply_dec > 0 else 0.0
+
+ final_unlock_ts = lock.get('final_unlock_timestamp') or 0
+ try:
+ final_unlock_ts = int(final_unlock_ts)
+ except (TypeError, ValueError):
+ final_unlock_ts = 0
+ lock_duration = max(0, final_unlock_ts - lock_timestamp_int)
+
+ lock_event = {
+ 'event_type': 'SupplyLock',
+ 'timestamp': lock_timestamp_int,
+ 'relative_ts': lock_ts_value - t0_timestamp,
+ 'wallet_address': lock.get('sender'),
+ 'token_address': token_address,
+ 'amount_pct_of_total_supply': pct_of_supply,
+ 'lock_duration': float(lock_duration),
+ 'priority_fee': lock.get('priority_fee', 0.0),
+ 'success': lock.get('success', False),
+ }
+ supply_lock_events.append(lock_event)
+ lock_sort_key = _event_execution_sort_key(
+ lock_ts_value,
+ slot=lock.get('slot'),
+ transaction_index=0,
+ instruction_index=0,
+ signature=lock.get('signature', '')
+ )
+ _register_event(lock_event, lock_sort_key)
+
+ # --- NEW: Process transfer events with strict validation ---
+ transfer_events = []
+ for transfer in transfer_records:
+ # --- VALIDATION: Ensure the destination wallet has a valid profile ---
+ if transfer['destination'] not in wallet_data:
+ print(f"INFO: Skipping transfer event {transfer['signature']} because destination wallet {transfer['destination']} has no profile.")
+ continue
+
+ # Calculate features
+ token_amount = transfer.get('amount_decimal', 0.0)
+ pct_of_supply = (token_amount / total_supply_dec) if total_supply_dec > 0 else 0.0
+
+ # Reconstruct pre-transfer balance of the source wallet
+ pre_transfer_source_balance = transfer.get('source_balance', 0.0) + token_amount
+ pct_of_holding = (token_amount / pre_transfer_source_balance) if pre_transfer_source_balance > 1e-9 else 1.0
+
+ # --- NEW: Classify LargeTransfer based on supply percentage ---
+ if pct_of_supply > LARGE_TRANSFER_SUPPLY_PCT_THRESHOLD:
+ event_type = 'LargeTransfer'
+ else:
+ event_type = 'Transfer'
+
+ transfer_ts_value = _timestamp_to_order_value(transfer.get('timestamp'))
+ transfer_event = {
+ 'event_type': event_type,
+ 'timestamp': int(transfer_ts_value),
+ 'relative_ts': transfer_ts_value - t0_timestamp,
+ 'wallet_address': transfer['source'],
+ 'destination_wallet_address': transfer['destination'],
+ 'token_address': token_address,
+ 'token_amount': token_amount,
+ 'transfer_pct_of_total_supply': pct_of_supply,
+ 'transfer_pct_of_holding': pct_of_holding,
+ 'priority_fee': transfer.get('priority_fee', 0.0)
+ }
+ transfer_events.append(transfer_event)
+ transfer_sort_key = _event_execution_sort_key(
+ transfer_ts_value,
+ slot=transfer.get('slot'),
+ transaction_index=transfer.get('transaction_index'),
+ instruction_index=transfer.get('instruction_index'),
+ signature=transfer.get('signature', '')
+ )
+ _register_event(transfer_event, transfer_sort_key)
+
+ # --- NEW: Correctly detect bundles with a single pass after event creation ---
+ # Trades below the min_trade_usd filter never produced events, so rebuild the
+ # filtered list here; it mirrors trade_events index-for-index. Entries are
+ # ordered by (timestamp, slot, transaction_index, instruction_index), so
+ # adjacent entries that share a slot belong to the same bundle.
+ bundle_candidate_records = [t for t in trade_records if t.get('total_usd', 0.0) >= self.min_trade_usd]
+ if len(bundle_candidate_records) > 1:
+ for i in range(1, len(bundle_candidate_records)):
+ if bundle_candidate_records[i]['slot'] == bundle_candidate_records[i-1]['slot']:
+ trade_events[i]['is_bundle'] = True
+ trade_events[i-1]['is_bundle'] = True
+
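The bundle pass is a plain adjacent-pair scan over a slot-ordered list: every run of trades sharing a slot gets every member flagged. Reduced to its essentials (a sketch over bare slot numbers, not the event dicts):

```python
# Flag every position that belongs to a run of equal adjacent slots.
# Both members of each equal pair are marked, so a run of length n
# ends up fully flagged after one left-to-right pass.
def flag_bundles(slots):
    flags = [False] * len(slots)
    for i in range(1, len(slots)):
        if slots[i] == slots[i - 1]:
            flags[i] = True
            flags[i - 1] = True
    return flags
```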
+ # Generate OnChain_Snapshot events using helper
+ self._generate_onchain_snapshots(
+ token_address=token_address,
+ t0_timestamp=t0_timestamp,
+ T_cutoff=T_cutoff,
+ interval_sec=HOLDER_SNAPSHOT_INTERVAL_SEC,
+ trade_events=trade_events,
+ transfer_events=transfer_events,
+ aggregation_trades=aggregation_trades,
+ wallet_data=wallet_data,
+ total_supply_dec=total_supply_dec,
+ _register_event_fn=_register_event
+ )
+
+ # 7. TODO: Fetch social events (tweets, replies, etc.) for all discovered wallets
+ # - Query tables like 'x_posts', 'pump_replies'.
+ # - Use the pooler to get indices for text and media.
+
+ # Sort the combined event sequence by precise execution order
+ event_sequence_entries.sort(key=lambda entry: entry[0])
+ event_sequence = [event for _, event in event_sequence_entries]
+
+ anchor_timestamp_int = int(_timestamp_to_order_value(T_cutoff))
+ anchor_price = None
+ if aggregation_trades:
+ for trade in reversed(aggregation_trades):
+ price_val = trade.get('price_usd')
+ if price_val is not None:
+ anchor_price = float(price_val)
+ break
+ if self.num_outputs > 0 and anchor_price is None:
+ print(f"INFO: Skipping token {token_address} (no pre-cutoff price for labeling).")
+ return None
+
+ future_price_series: List[Tuple[int, float]] = []
+ if (self.num_outputs > 0 and max_horizon_seconds > 0 and
+ anchor_price is not None):
+ timeline = [(anchor_timestamp_int, anchor_price)]
+ for trade in future_trades_for_labels:
+ price_val = trade.get('price_usd')
+ if price_val is None:
+ continue
+ ts_int = int(_timestamp_to_order_value(trade.get('timestamp')))
+ if ts_int <= timeline[-1][0]:
+ continue
+ timeline.append((ts_int, float(price_val)))
+ if len(timeline) > 1:
+ future_price_series = timeline
+
+ debug_label_entries: List[Dict[str, Any]] = []
+ if self.num_outputs > 0:
+ labels_tensor, labels_mask_tensor, debug_label_entries = self._compute_future_return_labels(
+ anchor_price, anchor_timestamp_int, future_price_series
+ )
+ if labels_mask_tensor.sum() == 0:
+ print(f"INFO: Skipping token {token_address} (no valid horizons in future).")
+ return None
+ print("\n[Label Debug]")
+ for entry in debug_label_entries:
+ print(f" Horizon {entry['horizon']}s -> target_ts={entry['target_ts']}, "
+ f"future_price={entry['future_price']}, return={entry['return']:.6f}, "
+ f"mask={int(entry['mask'])}")
+ else:
+ labels_tensor = torch.zeros(0)
+ labels_mask_tensor = torch.zeros(0)
+
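The labeling pass builds a strictly increasing `(timestamp, price)` timeline anchored at the cutoff, which `_compute_future_return_labels` (defined elsewhere) then samples per horizon. A sketch of one plausible lookup over that timeline — the exact sampling rule inside the real helper is an assumption here:

```python
# Look up the last observed price at or before anchor_ts + horizon_s on a
# timestamp-sorted timeline, and return the simple return vs the anchor price.
# Returns None when the timeline has no observation at or before the target.
def future_return(timeline, anchor_price, anchor_ts, horizon_s):
    target_ts = anchor_ts + horizon_s
    price = None
    for ts, p in timeline:
        if ts <= target_ts:
            price = p  # keep advancing to the latest in-range observation
        else:
            break
    if price is None:
        return None
    return (price - anchor_price) / anchor_price
```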
1617
+ # For now, we'll return the item with mint and trade events
1618
+ item = {
1619
+ 'event_sequence': event_sequence,
1620
+ 'wallets': wallet_data,
1621
+ 'tokens': all_token_data, # FIXED: Use the comprehensive token data
1622
+ 'graph_links': graph_links, # NEW: Add the fetched graph links
1623
+ 'embedding_pooler': pooler,
1624
+ 'labels': labels_tensor,
1625
+ 'labels_mask': labels_mask_tensor}
1626
+
1627
+ # --- NEW: Comprehensive logging before returning the item ---
1628
+ print("\n--- Dataset Item Generation Summary ---")
1629
+ print(f"Token Address: {token_address}"
1630
+ )
1631
+ print(f"\n[Event Sequence] ({len(item['event_sequence'])} events):")
1632
+ for i, event in enumerate(item['event_sequence']):
1633
+ print(f" - Event {i}: {event}")
1634
+
1635
+ print(f"\n[Wallets] ({len(item['wallets'])} wallets):")
1636
+ for i, (addr, data) in enumerate(item['wallets'].items()):
1637
+ print(f" - Wallet {addr}:")
1638
+ print(f" - Profile: {data.get('profile', {})}")
1639
+ print(f" - Socials: {data.get('socials', {})}")
1640
+
1641
+ print(f"\n[Tokens] ({len(item['tokens'])} tokens):")
1642
+ for addr, data in item['tokens'].items():
1643
+ print(f" - Token {addr}: {data}")
1644
+
1645
+ if self.num_outputs > 0:
1646
+ print(f"\n[Labels]")
1647
+ for h_idx, horizon in enumerate(self.horizons_seconds):
1648
+ offset = h_idx * len(self.quantiles)
1649
+ values = item['labels'][offset:offset + len(self.quantiles)]
1650
+ masks = item['labels_mask'][offset:offset + len(self.quantiles)]
1651
+ print(f" Horizon {horizon}s:")
1652
+ for q_idx, quantile in enumerate(self.quantiles):
1653
+ print(f" q={quantile:.2f}: value={values[q_idx]:.6f}, mask={masks[q_idx]:.0f}")
1654
+
1655
+ print("--- End Summary ---\n")
1656
+
1657
+ return item
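The labels tensor above is a flat vector laid out horizon-major: one contiguous block of `len(quantiles)` entries per horizon, indexed with `offset = h_idx * len(quantiles)`. A minimal numpy sketch of that indexing convention (horizons and quantiles taken from the test configuration in this repo; the label values here are just placeholder indices):

```python
import numpy as np

# Configuration matching the defaults used elsewhere in this diff
horizons_seconds = [30, 60, 120, 240, 420]
quantiles = [0.1, 0.5, 0.9]

# Flat labels vector: len(horizons) * len(quantiles) entries,
# filled with their own indices so the layout is visible
labels = np.arange(len(horizons_seconds) * len(quantiles), dtype=np.float64)

for h_idx, horizon in enumerate(horizons_seconds):
    offset = h_idx * len(quantiles)
    block = labels[offset:offset + len(quantiles)]
    # block[q_idx] is the label for (horizon, quantiles[q_idx])
    print(f"Horizon {horizon}s -> indices {block.tolist()}")
```

The same convention is what lets the inference script later reshape the flat model output with `view(B, len(horizons), len(quantiles))` without any reordering.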
data/ohlc_stats.npz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f39f15281440244b927a46d14a85537afd891163556d46ee3a79c80c25b6f36b
+ size 1660
data/preprocess_distribution.py ADDED
@@ -0,0 +1,164 @@
+ #!/usr/bin/env python3
+ """
+ Preprocess distribution statistics for OHLC normalization and token history coverage.
+
+ This script:
+ 1. Computes global mean/std figures for price/volume so downstream code can normalize.
+ 2. Prints descriptive stats about how much price history (in seconds) each token has,
+    helping decide which horizons are realistic.
+
+ All configuration is done via environment variables (see below).
+ """
+
+ import os
+ import pathlib
+ import sys
+ from typing import List
+
+ import numpy as np
+ import clickhouse_connect
+
+
+ # --- Configuration (override via env vars if needed) ---
+ CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
+ CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", "8123"))
+ CLICKHOUSE_USERNAME = os.getenv("CLICKHOUSE_USERNAME", "default")
+ CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
+ CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
+
+ OUTPUT_PATH = pathlib.Path(os.getenv("OHLC_STATS_PATH", "ohlc_stats.npz"))
+ MIN_PRICE_USD = float(os.getenv("OHLC_MIN_PRICE_USD", "0.0"))
+ MIN_VOLUME_USD = float(os.getenv("OHLC_MIN_VOLUME_USD", "0.0"))
+
+ TOKEN_ADDRESSES_ENV = os.getenv("OHLC_TOKEN_ADDRESSES", "")
+ TOKEN_ADDRESSES = tuple(addr.strip() for addr in TOKEN_ADDRESSES_ENV.split(",") if addr.strip()) or None
+
+
+ def build_where_clause() -> List[str]:
+     clauses = ["t.price_usd > %(min_price)s", "t.total_usd > %(min_vol)s"]
+     if TOKEN_ADDRESSES:
+         clauses.append("t.base_address IN %(token_addresses)s")
+     return clauses
+
+
+ def build_stats_query(where_sql: str) -> str:
+     return f"""
+         SELECT
+             AVG(t.price_usd) AS mean_price_usd,
+             stddevPop(t.price_usd) AS std_price_usd,
+             AVG(t.price) AS mean_price_native,
+             stddevPop(t.price) AS std_price_native,
+             AVG(t.total_usd) AS mean_trade_value_usd,
+             stddevPop(t.total_usd) AS std_trade_value_usd
+         FROM trades AS t
+         INNER JOIN mints AS m
+             ON m.mint_address = t.base_address
+         WHERE {where_sql}
+     """
+
+
+ def build_history_query(where_sql: str) -> str:
+     return f"""
+         SELECT
+             t.base_address AS token_address,
+             toUnixTimestamp(min(t.timestamp)) AS first_ts,
+             toUnixTimestamp(max(t.timestamp)) AS last_ts,
+             toUnixTimestamp(max(t.timestamp)) - toUnixTimestamp(min(t.timestamp)) AS history_seconds
+         FROM trades AS t
+         INNER JOIN mints AS m
+             ON m.mint_address = t.base_address
+         WHERE {where_sql}
+         GROUP BY token_address
+     """
+
+
+ def summarize_histories(histories: np.ndarray) -> None:
+     if histories.size == 0:
+         print("No token history stats available (no qualifying trades).")
+         return
+
+     stats = {
+         "count": histories.size,
+         "min": histories.min(),
+         "median": float(np.median(histories)),
+         "mean": histories.mean(),
+         "p90": float(np.percentile(histories, 90)),
+         "max": histories.max(),
+     }
+
+     def format_seconds(sec: float) -> str:
+         hours = sec / 3600.0
+         days = hours / 24.0
+         return f"{sec:.0f}s ({hours:.2f}h / {days:.2f}d)"
+
+     print("\nToken history coverage (seconds):")
+     print(f"  Tokens analyzed: {int(stats['count'])}")
+     print(f"  Min history: {format_seconds(stats['min'])}")
+     print(f"  Median history: {format_seconds(stats['median'])}")
+     print(f"  Mean history: {format_seconds(stats['mean'])}")
+     print(f"  90th percentile: {format_seconds(stats['p90'])}")
+     print(f"  Max history: {format_seconds(stats['max'])}")
+
+
+ def main() -> int:
+     where_clauses = build_where_clause()
+     where_sql = " AND ".join(where_clauses) if where_clauses else "1"
+     params: dict[str, object] = {
+         "min_price": max(MIN_PRICE_USD, 0.0),
+         "min_vol": max(MIN_VOLUME_USD, 0.0),
+     }
+     if TOKEN_ADDRESSES:
+         params["token_addresses"] = TOKEN_ADDRESSES
+
+     client = clickhouse_connect.get_client(
+         host=CLICKHOUSE_HOST,
+         port=CLICKHOUSE_PORT,
+         username=CLICKHOUSE_USERNAME,
+         password=CLICKHOUSE_PASSWORD,
+         database=CLICKHOUSE_DATABASE,
+     )
+
+     # --- Price/volume stats ---
+     stats_query = build_stats_query(where_sql)
+     stats_result = client.query(stats_query, parameters=params)
+     if not stats_result.result_rows:
+         print("ERROR: Stats query returned no rows. Check filters / connectivity.", file=sys.stderr)
+         return 1
+     (
+         mean_price_usd,
+         std_price_usd,
+         mean_price_native,
+         std_price_native,
+         mean_trade_value_usd,
+         std_trade_value_usd,
+     ) = stats_result.result_rows[0]
+
+     stats = {
+         "mean_price_usd": float(mean_price_usd or 0.0),
+         "std_price_usd": float(std_price_usd or 1.0),
+         "mean_price_native": float(mean_price_native or 0.0),
+         "std_price_native": float(std_price_native or 1.0),
+         "mean_trade_value_usd": float(mean_trade_value_usd or 0.0),
+         "std_trade_value_usd": float(std_trade_value_usd or 1.0),
+     }
+
+     OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+     np.savez(OUTPUT_PATH, **stats)
+
+     print(f"Saved stats to {OUTPUT_PATH.resolve()}:")
+     for key, value in stats.items():
+         print(f"  {key}: {value:.6f}")
+
+     # --- Token history coverage ---
+     history_query = build_history_query(where_sql)
+     history_result = client.query(history_query, parameters=params)
+     history_seconds = np.array(
+         [float(row[3]) for row in history_result.result_rows if row[3] is not None],
+         dtype=np.float64
+     )
+     summarize_histories(history_seconds)
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
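The `.npz` file written above is meant to be consumed downstream for z-score normalization. A minimal sketch of that consumption (the key names match what the script saves via `np.savez`; the `zscore` helper and the in-memory buffer are illustrative, not part of the repo):

```python
import io
import numpy as np

# Stand-in for the stats dict the script computes; same keys as np.savez above
stats = {
    "mean_price_usd": 0.00015,
    "std_price_usd": 0.00042,
}

# Round-trip through an in-memory npz, mimicking writing/reading ohlc_stats.npz
buf = io.BytesIO()
np.savez(buf, **stats)
buf.seek(0)
loaded = np.load(buf)

def zscore(value: float, mean: float, std: float) -> float:
    # Guard against a degenerate std, mirroring the `or 1.0` fallback in the script
    return (value - mean) / (std if std > 0 else 1.0)

z = zscore(0.00057, float(loaded["mean_price_usd"]), float(loaded["std_price_usd"]))
print(f"normalized price: {z:.4f}")
```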
graph_schema.rs ADDED
@@ -0,0 +1,115 @@
+ /// Tracks direct capital flow and identifies funding chains.
+ pub struct TransferLink {
+     pub signature: String,
+     pub source: String,
+     pub destination: String,
+     pub mint: String,
+     pub timestamp: i64,
+ }
+
+ /// Identifies wallets trading the same token in the same slot.
+ pub struct BundleTradeLink {
+     pub signatures: Vec<String>,
+     pub wallet_a: String,
+     pub wallet_b: String,
+     pub mint: String,
+     pub slot: i64,
+     pub timestamp: i64,
+ }
+
+ /// Reveals a behavioral pattern of one wallet mirroring another's successful trade.
+ pub struct CopiedTradeLink {
+     pub timestamp: i64,
+     pub leader_buy_sig: String,
+     pub leader_sell_sig: String,
+     pub follower_buy_sig: String,
+     pub follower_sell_sig: String,
+     pub follower: String,
+     pub leader: String,
+     pub mint: String,
+     pub time_gap_on_buy_sec: i64,
+     pub time_gap_on_sell_sec: i64,
+     pub leader_pnl: f64,
+     pub follower_pnl: f64,
+
+     pub leader_buy_total: f64,
+     pub leader_sell_total: f64,
+
+     pub follower_buy_total: f64,
+     pub follower_sell_total: f64,
+     pub follower_buy_slippage: f32,
+     pub follower_sell_slippage: f32,
+ }
+
+ /// Represents a link where a group of wallets re-engage with a token in a coordinated manner.
+ pub struct CoordinatedActivityLink {
+     pub timestamp: i64,
+     pub leader_first_sig: String,
+     pub leader_second_sig: String,
+     pub follower_first_sig: String,
+     pub follower_second_sig: String,
+     pub follower: String,
+     pub leader: String,
+     pub mint: String,
+     pub time_gap_on_first_sec: i64,
+     pub time_gap_on_second_sec: i64,
+ }
+
+ /// Links a token to its original creator.
+ pub struct MintedLink {
+     pub signature: String,
+     pub timestamp: i64,
+     pub buy_amount: f64,
+ }
+
+ /// Connects a token to its successful first-movers.
+ pub struct SnipedLink {
+     pub timestamp: i64,
+     pub signature: String,
+     pub rank: i64,
+     pub sniped_amount: f64,
+ }
+
+ /// Represents a connection to the wallet that locked supply.
+ pub struct LockedSupplyLink {
+     pub timestamp: i64,
+     pub signature: String,
+     pub amount: f64,
+     pub unlock_timestamp: u64,
+ }
+
+ /// Links a token to the wallet that burned tokens.
+ pub struct BurnedLink {
+     pub signature: String,
+     pub amount: f64,
+     pub timestamp: i64,
+ }
+
+ /// Identifies wallets that provided liquidity, signaling high conviction.
+ pub struct ProvidedLiquidityLink {
+     pub signature: String,
+     pub wallet: String,
+     pub token: String,
+     pub pool_address: String,
+     pub amount_base: f64,
+     pub amount_quote: f64,
+     pub timestamp: i64,
+ }
+
+ /// A derived link connecting a token to its largest holders.
+ pub struct WhaleOfLink {
+     pub timestamp: i64,
+     pub wallet: String,
+     pub token: String,
+     pub holding_pct_at_creation: f32, // Holding % when the link was made
+     pub ath_usd_at_creation: f64,     // Token's ATH when the link was made
+ }
+
+ /// A derived link connecting a token to its most profitable traders.
+ pub struct TopTraderOfLink {
+     pub timestamp: i64,
+     pub wallet: String,
+     pub token: String,
+     pub pnl_at_creation: f64,     // The PNL that first triggered the link
+     pub ath_usd_at_creation: f64, // Token's ATH when the link was made
+ }
inference.py ADDED
@@ -0,0 +1,271 @@
+ # inference.py
+
+ import torch
+ import traceback
+ import time
+
+ # Import all the necessary components from our project
+ from models.model import Oracle
+ from data.data_collator import MemecoinCollator
+ from models.multi_modal_processor import MultiModalEncoder
+ from data.data_loader import OracleDataset
+ from data.data_fetcher import DataFetcher
+ from models.helper_encoders import ContextualTimeEncoder
+ from models.token_encoder import TokenEncoder
+ from models.wallet_encoder import WalletEncoder
+ from models.graph_updater import GraphUpdater
+ from models.ohlc_embedder import OHLCEmbedder
+ import models.vocabulary as vocab
+
+ # --- NEW: Import database clients ---
+ from clickhouse_driver import Client as ClickHouseClient
+ from neo4j import GraphDatabase
+
+ # =============================================================================
+ # Inference/Test Script for the Oracle Model
+ # This script replicates the test logic previously in model.py
+ # =============================================================================
+ if __name__ == "__main__":
+     print("--- Oracle Inference Script (Full Pipeline Test) ---")
+
+     # --- 1. Define Configs ---
+     OHLC_SEQ_LEN = 60
+     print(f"Using {vocab.NUM_EVENT_TYPES} event types from vocabulary.")
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
+     if device.type == 'cpu':
+         dtype = torch.float32
+     print(f"Using device: {device}, dtype: {dtype}")
+
+     _test_quantiles = [0.1, 0.5, 0.9]
+     _test_horizons = [30, 60, 120, 240, 420]
+     _test_num_outputs = len(_test_quantiles) * len(_test_horizons)
+
+     # --- 2. Instantiate ALL Encoders ---
+     print("Instantiating encoders (using defaults)...")
+     try:
+         multi_modal_encoder = MultiModalEncoder(dtype=dtype)
+         real_time_enc = ContextualTimeEncoder(dtype=dtype)
+
+         real_token_enc = TokenEncoder(
+             multi_dim=multi_modal_encoder.embedding_dim,
+             dtype=dtype
+         )
+         real_wallet_enc = WalletEncoder(encoder=multi_modal_encoder, dtype=dtype)
+         real_graph_upd = GraphUpdater(time_encoder=real_time_enc, dtype=dtype)
+
+         real_ohlc_emb = OHLCEmbedder(
+             num_intervals=vocab.NUM_OHLC_INTERVALS,
+             sequence_length=OHLC_SEQ_LEN,
+             dtype=dtype
+         )
+
+         print(f"TokenEncoder default output_dim: {real_token_enc.output_dim}")
+         print(f"WalletEncoder default d_model: {real_wallet_enc.d_model}")
+         print(f"OHLCEmbedder default output_dim: {real_ohlc_emb.output_dim}")
+
+         print("Encoders instantiated.")
+     except Exception as e:
+         print(f"Failed to instantiate encoders: {e}")
+         traceback.print_exc()
+         exit()
+
+     # --- 3. Instantiate the Collator ---
+     collator = MemecoinCollator(
+         event_type_to_id=vocab.EVENT_TO_ID,
+         device=device,
+         multi_modal_encoder=multi_modal_encoder,
+         dtype=dtype,
+         ohlc_seq_len=OHLC_SEQ_LEN,
+         max_seq_len=50
+     )
+     print("MemecoinCollator (fast batcher) instantiated.")
+
+     # --- 4. Instantiate the Oracle Model ---
+     print("Instantiating Oracle (full pipeline)...")
+     model = Oracle(
+         token_encoder=real_token_enc,
+         wallet_encoder=real_wallet_enc,
+         graph_updater=real_graph_upd,
+         time_encoder=real_time_enc,
+         multi_modal_dim=multi_modal_encoder.embedding_dim,
+         num_event_types=vocab.NUM_EVENT_TYPES,
+         event_pad_id=vocab.EVENT_TO_ID['__PAD__'],
+         event_type_to_id=vocab.EVENT_TO_ID,
+         model_config_name="Qwen/Qwen3-0.6B",
+         quantiles=_test_quantiles,
+         horizons_seconds=_test_horizons,
+         dtype=dtype,
+         ohlc_embedder=real_ohlc_emb
+     ).to(device)
+     model.eval()
+     print(f"Oracle d_model: {model.d_model}")
+
+     # --- 5. Create Dataset and run pre-collation step ---
+     print("Creating Dataset...")
+
+     # --- NEW: Initialize real database clients and DataFetcher ---
+     try:
+         print("Connecting to databases...")
+         # ClickHouse running locally on the native protocol port 9000 with no auth
+         clickhouse_client = ClickHouseClient(host='localhost', port=9000)
+         # Neo4j running locally on port 7687 with no auth
+         neo4j_driver = GraphDatabase.driver("bolt://localhost:7687", auth=None)
+
+         data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
+         print("Database clients and DataFetcher initialized.")
+
+         # --- Fetch mints to get the first token for processing ---
+         all_mints = data_fetcher.get_all_mints()
+         if not all_mints:
+             print("\n❌ No mints found in the database. Exiting test.")
+             exit()
+
+         # --- FIXED: Instantiate the dataset in REAL mode, removing is_test flag ---
+         dataset = OracleDataset(
+             data_fetcher=data_fetcher,
+             horizons_seconds=_test_horizons,
+             quantiles=_test_quantiles,
+             max_samples=57)
+
+     except Exception as e:
+         print(f"FATAL: Could not initialize database connections or dataset: {e}")
+         traceback.print_exc()
+         exit()
+
+     # --- PRODUCTION-READY: Process a full batch of items from the dataset ---
+     print(f"\n--- Processing a batch of up to {len(dataset)} items from the dataset ---")
+     batch_items = []
+     for i in range(len(dataset)):
+         token_addr = dataset.sampled_mints[i].get('mint_address', 'unknown')
+         print(f"  - Attempting to process sample {i+1}/{len(dataset)} ({token_addr})...")
+         fetch_start = time.time()
+         sample = dataset[i]
+         fetch_elapsed = time.time() - fetch_start
+         print(f"    ... fetch completed in {fetch_elapsed:.2f}s")
+         if sample is not None:
+             batch_items.append(sample)
+             print("    ... Success! Sample added to batch.")
+
+     if not batch_items:
+         print("\n❌ No valid samples could be generated from the dataset. Exiting.")
+         exit()
+
+     # --- 6. Run Collator AND Model ---
+     print("\n--- Testing Pipeline (Collator + Model.forward) ---")
+     try:
+         # 1. Collator
+         collate_start = time.time()
+         collated_batch = collator(batch_items)
+         collate_elapsed = time.time() - collate_start
+         print("Collation successful!")
+         print(f"Collation time for batch of {len(batch_items)} tokens: {collate_elapsed:.2f}s")
+
+         # --- Check collator output ---
+         B = len(batch_items)
+         L = collated_batch['attention_mask'].shape[1]
+         assert 'ohlc_price_tensors' in collated_batch
+         ohlc_price_tensors = collated_batch['ohlc_price_tensors']
+         assert ohlc_price_tensors.dim() == 3, f"Expected 3D OHLC tensor, got shape {tuple(ohlc_price_tensors.shape)}"
+         assert ohlc_price_tensors.shape[1] == 2, f"Expected OHLC tensor with 2 rows (open/close), got {ohlc_price_tensors.shape[1]}"
+         assert ohlc_price_tensors.shape[2] == OHLC_SEQ_LEN, f"Expected OHLC seq len {OHLC_SEQ_LEN}, got {ohlc_price_tensors.shape[2]}"
+         assert collated_batch['ohlc_interval_ids'].shape[0] == ohlc_price_tensors.shape[0], "Interval ids must align with OHLC segments"
+         assert ohlc_price_tensors.dtype == dtype, f"OHLC tensor dtype {ohlc_price_tensors.dtype} != expected {dtype}"
+         print(f"Collator produced {ohlc_price_tensors.shape[0]} OHLC segment(s).")
+
+         # --- FIXED: Update assertions for event-specific data which is mostly empty for now ---
+         assert collated_batch['dest_wallet_indices'].shape == (B, L)
+         assert collated_batch['transfer_numerical_features'].shape == (B, L, 4)
+         assert collated_batch['trade_numerical_features'].shape == (B, L, 8)  # Corrected from 10
+         assert collated_batch['deployer_trade_numerical_features'].shape == (B, L, 8)  # Corrected from 10
+         assert collated_batch['smart_wallet_trade_numerical_features'].shape == (B, L, 8)  # Corrected from 10
+         assert collated_batch['pool_created_numerical_features'].shape == (B, L, 2)
+         assert collated_batch['liquidity_change_numerical_features'].shape == (B, L, 1)
+         assert collated_batch['fee_collected_numerical_features'].shape == (B, L, 1)
+         assert collated_batch['token_burn_numerical_features'].shape == (B, L, 2)
+         assert collated_batch['supply_lock_numerical_features'].shape == (B, L, 2)
+         assert collated_batch['onchain_snapshot_numerical_features'].shape == (B, L, 14)
+         assert collated_batch['trending_token_numerical_features'].shape == (B, L, 1)
+         assert collated_batch['boosted_token_numerical_features'].shape == (B, L, 2)
+         # assert len(collated_batch['holder_snapshot_raw_data']) == 1  # No holder snapshots yet
+         # assert len(collated_batch['textual_event_data']) == 8  # No textual events yet
+         assert collated_batch['dexboost_paid_numerical_features'].shape == (B, L, 2)
+         print("Collator correctly processed all event-specific numerical data into their respective tensors.")
+
+         # --- NEW: Comprehensive Debugging Output ---
+         print("\n--- Collated Batch Debug Output ---")
+         print(f"Batch Size: {B}, Max Sequence Length: {L}")
+
+         # Print shapes of key tensors
+         print("\n[Core Tensors]")
+         print(f"  event_type_ids: {collated_batch['event_type_ids'].shape}")
+         print(f"  attention_mask: {collated_batch['attention_mask'].shape}")
+         print(f"  timestamps_float: {collated_batch['timestamps_float'].shape}")
+
+         print("\n[Pointer Tensors]")
+         print(f"  wallet_indices: {collated_batch['wallet_indices'].shape}")
+         print(f"  token_indices: {collated_batch['token_indices'].shape}")
+
+         print("\n[Encoder Inputs]")
+         print(f"  embedding_pool: {collated_batch['embedding_pool'].shape}")
+         # --- FIXED: Check for a key that still exists after removing address embeddings ---
+         if collated_batch['token_encoder_inputs']['name_embed_indices'].numel() > 0:
+             print(f"  token_encoder_inputs contains {collated_batch['token_encoder_inputs']['name_embed_indices'].shape[0]} tokens.")
+         else:
+             print("  token_encoder_inputs is empty.")
+         if collated_batch['wallet_encoder_inputs']['profile_rows']:
+             print(f"  wallet_encoder_inputs contains {len(collated_batch['wallet_encoder_inputs']['profile_rows'])} wallets.")
+         else:
+             print("  wallet_encoder_inputs is empty.")
+
+         print("\n[Graph Links]")
+         if collated_batch['graph_updater_links']:
+             for link_name, data in collated_batch['graph_updater_links'].items():
+                 print(f"  - {link_name}: {data['edge_index'].shape[1]} edges")
+         else:
+             print("  No graph links in this batch.")
+         print("--- End Debug Output ---\n")
+
+         print("Embedding pool size:", collated_batch["embedding_pool"].shape[0])
+         print("Max name_emb_idx:", collated_batch["token_encoder_inputs"]["name_embed_indices"].max().item())
+
+         # 2. Model Forward Pass
+         with torch.no_grad():
+             model_outputs = model(collated_batch)
+         quantile_logits = model_outputs["quantile_logits"]
+         hidden_states = model_outputs["hidden_states"]
+         attention_mask = model_outputs["attention_mask"]
+         pooled_states = model_outputs["pooled_states"]
+         print("Model forward pass successful!")
+
+         # --- 7. Verify Output ---
+         print("\n--- Test Results ---")
+         D_MODEL = model.d_model
+
+         print(f"Final hidden_states shape: {hidden_states.shape}")
+         print(f"Final attention_mask shape: {attention_mask.shape}")
+
+         assert hidden_states.shape == (B, L, D_MODEL)
+         assert attention_mask.shape == (B, L)
+         assert hidden_states.dtype == dtype
+
+         print(f"Output mean (sanity check): {hidden_states.mean().item()}")
+         print(f"Pooled state shape: {pooled_states.shape}")
+         print(f"Quantile logits shape: {quantile_logits.shape}")
+
+         quantile_grid = quantile_logits.view(B, len(_test_horizons), len(_test_quantiles))
+         print("\n[Quantile Predictions]")
+         for b_idx in range(B):
+             print(f"  Sample {b_idx}:")
+             for h_idx, horizon in enumerate(_test_horizons):
+                 row = quantile_grid[b_idx, h_idx]
+                 print(f"    Horizon {horizon}s -> " + ", ".join(
+                     f"q={q:.2f}: {row[q_idx].item():.6f}"
+                     for q_idx, q in enumerate(_test_quantiles)
+                 ))
+
+         print("\n✅ **Test Passed!** Full ENCODING pipeline is working.")
+
+     except Exception as e:
+         print(f"\n❌ Error during pipeline test: {e}")
+         traceback.print_exc()
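The `quantile_logits.view(B, H, Q)` reshape in the script relies on the model emitting a horizon-major flat vector per sample, i.e. `flat[b, h * Q + q] == grid[b, h, q]`. A small numpy sketch of that identity (dimensions taken from the test configuration; the index arithmetic is the point, the values are placeholders):

```python
import numpy as np

# Test configuration used by the script above
B = 2
horizons = [30, 60, 120, 240, 420]
quantiles = [0.1, 0.5, 0.9]
H, Q = len(horizons), len(quantiles)

# Flat model output of shape (B, H * Q), filled with its own indices
flat = np.arange(B * H * Q, dtype=np.float64).reshape(B, H * Q)

# Horizon-major reshape: grid[b, h, q] == flat[b, h * Q + q]
grid = flat.reshape(B, H, Q)
assert grid[1, 2, 0] == flat[1, 2 * Q + 0]
print("reshape preserves horizon-major order")
```

Because the dataset writes its labels with the same `offset = h_idx * len(quantiles)` layout, the flat predictions and flat labels line up element-for-element without any permutation.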
link_graph.rs ADDED
@@ -0,0 +1,2275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ use crate::aggregator::graph_schema::{
2
+ BundleTradeLink, BurnedLink, CoordinatedActivityLink, CopiedTradeLink, LockedSupplyLink,
3
+ MintedLink, ProvidedLiquidityLink, SnipedLink, TopTraderOfLink, TransferLink, WhaleOfLink,
4
+ };
5
+ use crate::handlers::constants::{
6
+ NATIVE_MINT, PROTOCOL_PUMPFUN_LAUNCHPAD, USD1_MINT, USDC_MINT, USDT_MINT,
7
+ };
8
+ use crate::types::{
9
+ BurnRow, EventPayload, EventType, LiquidityRow, MintRow, SupplyLockRow, TradeRow, TransferRow,
10
+ };
11
+ use anyhow::{Result, anyhow};
12
+ use chrono::Utc;
13
+ use clickhouse::{Client, Row};
14
+ use futures::stream::{self, StreamExt};
15
+ use itertools::Itertools;
16
+ use neo4rs::{BoltType, Graph, query};
17
+ use once_cell::sync::Lazy;
18
+ use serde::Deserialize;
19
+ use solana_sdk::native_token::LAMPORTS_PER_SOL;
20
+ use std::collections::{HashMap, HashSet, VecDeque};
21
+ use std::future::Future;
22
+ use std::str::FromStr;
23
+ use std::sync::Arc;
24
+ use std::sync::atomic::{AtomicUsize, Ordering};
25
+ use std::time::Duration;
26
+ use tokio::sync::{Mutex, mpsc};
27
+ use tokio::time::sleep;
28
+ use tokio::time::{Instant, MissedTickBehavior, interval};
29
+ use tokio::try_join;
30
+
31
+ fn decimals_for_quote(mint: &str) -> u8 {
32
+ if mint == NATIVE_MINT {
33
+ 9
34
+ } else if mint == USDC_MINT || mint == USDT_MINT || mint == USD1_MINT {
35
+ 6
36
+ } else {
37
+ 9 // default assumption if unknown
38
+ }
39
+ }
40
+
41
+ #[derive(Debug)]
42
+ struct LinkGraphConfig {
43
+ time_window_seconds: u32,
44
+ copied_trade_window_seconds: i64,
45
+ sniper_rank_threshold: u64,
46
+ whale_rank_threshold: u64,
47
+ min_top_trader_pnl: f32,
+ min_trade_total_usd: f64,
+ ath_price_threshold_usd: f64,
+ window_max_wait_ms: u64,
+ late_slack_ms: u64,
+ chunk_size_large: usize,
+ chunk_size_historical: usize,
+ chunk_size_mint_small: usize,
+ chunk_size_mint_large: usize,
+ chunk_size_token: usize,
+ trade_cache_max_entries: usize,
+ trade_cache_ttl_secs: u32,
+ trade_cache_max_recent: usize,
+ writer_channel_capacity: usize,
+ writer_max_batch_rows: usize,
+ writer_retry_attempts: u32,
+ writer_retry_backoff_ms: u64,
+ ath_fetch_chunk_size: usize,
+ ch_retry_attempts: u32,
+ ch_retry_backoff_ms: u64,
+ ch_fail_fast: bool,
+ }
+
+ static LINK_GRAPH_CONFIG: Lazy<LinkGraphConfig> = Lazy::new(|| LinkGraphConfig {
+ time_window_seconds: env_parse("LINK_GRAPH_TIME_WINDOW_SECONDS", 120_u32),
+ copied_trade_window_seconds: env_parse("LINK_GRAPH_COPIED_TRADE_WINDOW_SECONDS", 60_i64),
+ sniper_rank_threshold: env_parse("LINK_GRAPH_SNIPER_RANK_THRESHOLD", 45_u64),
+ whale_rank_threshold: env_parse("LINK_GRAPH_WHALE_RANK_THRESHOLD", 5_u64),
+ min_top_trader_pnl: env_parse("LINK_GRAPH_MIN_TOP_TRADER_PNL", 1.0_f32),
+ min_trade_total_usd: env_parse("LINK_GRAPH_MIN_TRADE_TOTAL_USD", 20.0_f64),
+ ath_price_threshold_usd: env_parse("LINK_GRAPH_ATH_PRICE_THRESHOLD_USD", 0.0002000_f64),
+ window_max_wait_ms: env_parse("LINK_GRAPH_WINDOW_MAX_WAIT_MS", 250_u64),
+ late_slack_ms: env_parse("LINK_GRAPH_LATE_SLACK_MS", 2000_u64),
+ chunk_size_large: env_parse("LINK_GRAPH_CHUNK_SIZE_LARGE", 3000_usize),
+ chunk_size_historical: env_parse("LINK_GRAPH_CHUNK_SIZE_HISTORICAL", 1000_usize),
+ chunk_size_mint_small: env_parse("LINK_GRAPH_CHUNK_SIZE_MINT_SMALL", 1500_usize),
+ chunk_size_mint_large: env_parse("LINK_GRAPH_CHUNK_SIZE_MINT_LARGE", 3000_usize),
+ chunk_size_token: env_parse("LINK_GRAPH_CHUNK_SIZE_TOKEN", 3000_usize),
+ trade_cache_max_entries: env_parse("LINK_GRAPH_TRADE_CACHE_MAX_ENTRIES", 1_000_000_usize),
+ trade_cache_ttl_secs: env_parse("LINK_GRAPH_TRADE_CACHE_TTL_SECS", 600_u32),
+ trade_cache_max_recent: env_parse("LINK_GRAPH_TRADE_CACHE_MAX_RECENT", 16_usize),
+ writer_channel_capacity: env_parse("LINK_GRAPH_WRITER_CHANNEL_CAPACITY", 5000_usize),
+ writer_max_batch_rows: env_parse("LINK_GRAPH_WRITER_MAX_BATCH_ROWS", 1000_usize),
+ writer_retry_attempts: env_parse("LINK_GRAPH_WRITER_RETRY_ATTEMPTS", 3_u32),
+ writer_retry_backoff_ms: env_parse("LINK_GRAPH_WRITER_RETRY_BACKOFF_MS", 250_u64),
+ ath_fetch_chunk_size: env_parse("LINK_GRAPH_ATH_FETCH_CHUNK_SIZE", 500_usize),
+ ch_retry_attempts: env_parse("LINK_GRAPH_CH_RETRY_ATTEMPTS", 3_u32),
+ ch_retry_backoff_ms: env_parse("LINK_GRAPH_CH_RETRY_BACKOFF_MS", 500_u64),
+ ch_fail_fast: env_parse("LINK_GRAPH_CH_FAIL_FAST", true),
+ });
+
+ fn env_parse<T: FromStr>(key: &str, default: T) -> T {
+ std::env::var(key)
+ .ok()
+ .and_then(|v| v.parse::<T>().ok())
+ .unwrap_or(default)
+ }
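
The `env_parse` helper above gives every `LINK_GRAPH_*` knob an environment-variable override with a hard-coded fallback: unset or malformed values silently fall back to the default. A minimal standalone sketch of the same pattern (the `DEMO_*` variable name is illustrative, not from the service):

```rust
use std::str::FromStr;

// Parse an environment variable into any `FromStr` type,
// falling back to `default` when the variable is unset or malformed.
fn env_parse<T: FromStr>(key: &str, default: T) -> T {
    std::env::var(key)
        .ok()
        .and_then(|v| v.parse::<T>().ok())
        .unwrap_or(default)
}

fn main() {
    // Unset variable: the default wins.
    std::env::remove_var("DEMO_WINDOW_MAX_WAIT_MS");
    assert_eq!(env_parse("DEMO_WINDOW_MAX_WAIT_MS", 250_u64), 250);

    // Valid override: the parsed value wins.
    std::env::set_var("DEMO_WINDOW_MAX_WAIT_MS", "1000");
    assert_eq!(env_parse("DEMO_WINDOW_MAX_WAIT_MS", 250_u64), 1000);

    // Malformed override: falls back instead of panicking.
    std::env::set_var("DEMO_WINDOW_MAX_WAIT_MS", "oops");
    assert_eq!(env_parse("DEMO_WINDOW_MAX_WAIT_MS", 250_u64), 250);
}
```

One consequence of this design: a typo in a variable's value is indistinguishable from leaving it unset, so misconfiguration is silent rather than fatal.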
+
+ #[derive(Row, Deserialize, Clone)]
+ struct FullHistTrade {
+ maker: String,
+ base_address: String,
+ timestamp: u32,
+ signature: String,
+ trade_type: u8,
+ total_usd: f64,
+ slippage: f32,
+ }
+
+ enum FollowerLink {
+ Copied(CopiedTradeLink),
+ Coordinated(CoordinatedActivityLink),
+ }
+
+ pub struct LinkGraph {
+ db_client: Client,
+ neo4j_client: Arc<Graph>,
+ rx: mpsc::Receiver<EventPayload>,
+ link_graph_depth: Arc<AtomicUsize>,
+ write_lock: Mutex<()>,
+ trade_cache: Arc<Mutex<HashMap<(String, String), CachedPairState>>>,
+ write_sender: mpsc::Sender<WriteJob>,
+ writer_depth: Arc<AtomicUsize>,
+ }
+
+ // Global Neo4j write lock to serialize batches across workers and avoid deadlocks.
+ static NEO4J_WRITE_LOCK: Lazy<Mutex<()>> = Lazy::new(|| Mutex::new(()));
+
+ #[derive(Row, Deserialize, Debug)]
+ struct Ping {
+ alive: u8,
+ }
+ #[derive(Row, Deserialize, Debug)]
+ struct CountResult {
+ count: u64,
+ }
+
+ #[derive(Clone, Debug)]
+ struct CachedTrade {
+ maker: String,
+ base_address: String,
+ timestamp: u32,
+ signature: String,
+ trade_type: u8,
+ total_usd: f64,
+ slippage: f32,
+ }
+
+ #[derive(Debug)]
+ struct CachedPairState {
+ first_buy: Option<CachedTrade>,
+ first_sell: Option<CachedTrade>,
+ recent: VecDeque<CachedTrade>,
+ last_seen: u32,
+ }
+
+ #[derive(Debug)]
+ pub struct WriteJob {
+ query: String,
+ params: Vec<HashMap<String, BoltType>>,
+ }
+
+ impl LinkGraph {
+ pub async fn new(
+ db_client: Client,
+ neo4j_client: Arc<Graph>,
+ rx: mpsc::Receiver<EventPayload>,
+ link_graph_depth: Arc<AtomicUsize>,
+ write_sender: mpsc::Sender<WriteJob>,
+ writer_depth: Arc<AtomicUsize>,
+ ) -> Result<Self> {
+ let _: Ping = db_client.query("SELECT 1 as alive").fetch_one().await?;
+ neo4j_client.run(query("MATCH (n) RETURN count(n)")).await?;
+ println!("[LinkGraph] ✔️ Connected to ClickHouse, Neo4j. Listening on channel.");
+ Ok(Self {
+ db_client,
+ neo4j_client,
+ rx,
+ link_graph_depth,
+ write_lock: Mutex::new(()),
+ trade_cache: Arc::new(Mutex::new(HashMap::new())),
+ write_sender,
+ writer_depth,
+ })
+ }
+
+ async fn with_ch_retry<T, F, Fut>(&self, mut op: F, label: &str) -> Result<T>
+ where
+ F: FnMut() -> Fut,
+ Fut: Future<Output = Result<T>>,
+ {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let mut attempts = 0;
+ loop {
+ attempts += 1;
+ match op().await {
+ Ok(res) => return Ok(res),
+ Err(e) => {
+ if attempts >= cfg.ch_retry_attempts {
+ return Err(anyhow!(
+ "[LinkGraph] {} failed after {} attempts: {}",
+ label,
+ attempts,
+ e
+ ));
+ }
+ let backoff = cfg.ch_retry_backoff_ms * attempts as u64;
+ eprintln!(
+ "[LinkGraph] ⚠️ {} retry {}/{} after {}ms: {}",
+ label, attempts, cfg.ch_retry_attempts, backoff, e
+ );
+ sleep(Duration::from_millis(backoff)).await;
+ }
+ }
+ }
+ }
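
`with_ch_retry` backs off linearly: the n-th failed attempt sleeps `ch_retry_backoff_ms * n`, and the final attempt returns the error instead of sleeping. With the defaults (500 ms base, 3 attempts) the delays are 500 ms then 1000 ms. A sketch of just the schedule arithmetic, separated from the async machinery:

```rust
// Linear backoff as used by `with_ch_retry`: the n-th failed attempt
// sleeps `base_ms * n`; the last attempt errors out without sleeping,
// so only attempts 1..max_attempts produce a delay.
fn backoff_schedule(base_ms: u64, max_attempts: u32) -> Vec<u64> {
    (1..max_attempts as u64).map(|n| base_ms * n).collect()
}

fn main() {
    // Defaults: LINK_GRAPH_CH_RETRY_BACKOFF_MS = 500, attempts = 3.
    assert_eq!(backoff_schedule(500, 3), vec![500, 1000]);
    println!("backoff delays: {:?}", backoff_schedule(500, 3));
}
```

Linear (rather than exponential) growth keeps worst-case stall time bounded and predictable for the small retry counts used here.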
+
+ pub async fn run(&mut self) -> Result<()> {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let mut message_buffer: Vec<EventPayload> = Vec::new();
+ let mut current_window_start: Option<u32> = None;
+ let mut window_opened_at: Option<Instant> = None;
+ let mut flush_check = interval(Duration::from_millis(cfg.window_max_wait_ms.max(50)));
+ flush_check.set_missed_tick_behavior(MissedTickBehavior::Delay);
+ let late_slack_secs: u32 = (cfg.late_slack_ms / 1000) as u32;
+
+ loop {
+ tokio::select! {
+ maybe_payload = self.rx.recv() => {
+ match maybe_payload {
+ Some(payload) => {
+ // one item left the channel
+ self.link_graph_depth.fetch_sub(1, Ordering::Relaxed);
+ if current_window_start.is_none() {
+ current_window_start = Some(payload.timestamp);
+ window_opened_at = Some(Instant::now());
+ }
+
+ let window_end = current_window_start.unwrap() + cfg.time_window_seconds;
+ if payload.timestamp <= window_end + late_slack_secs {
+ message_buffer.push(payload);
+ } else {
+ if !message_buffer.is_empty() {
+ message_buffer.sort_by_key(|p| p.timestamp);
+ let batch = std::mem::take(&mut message_buffer);
+ if let Err(e) = self.process_batch_with_retry(batch).await {
+ eprintln!("[LinkGraph] 🔴 Fatal processing window: {}", e);
+ std::process::exit(1);
+ }
+ }
+ current_window_start = Some(payload.timestamp);
+ window_opened_at = Some(Instant::now());
+ message_buffer.push(payload);
+ }
+ }
+ None => {
+ eprintln!("[LinkGraph] 🔴 Input channel closed. Exiting.");
+ if !message_buffer.is_empty() {
+ message_buffer.sort_by_key(|p| p.timestamp);
+ let batch = std::mem::take(&mut message_buffer);
+ if let Err(e) = self.process_batch_with_retry(batch).await {
+ eprintln!("[LinkGraph] 🔴 Fatal processing final window: {}", e);
+ }
+ }
+ // Fatal: the producer is gone. Exit so it's obvious.
+ std::process::exit(1);
+ }
+ }
+ }
+ _ = flush_check.tick() => {
+ if !message_buffer.is_empty() {
+ if let Some(opened) = window_opened_at {
+ if opened.elapsed() >= Duration::from_millis(cfg.window_max_wait_ms) {
+ message_buffer.sort_by_key(|p| p.timestamp);
+ let batch = std::mem::take(&mut message_buffer);
+ if let Err(e) = self.process_batch_with_retry(batch).await {
+ eprintln!("[LinkGraph] 🔴 Fatal processing timed window: {}", e);
+ std::process::exit(1);
+ }
+ current_window_start = None;
+ window_opened_at = None;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ Ok(())
+ }
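
The windowing rule in `run` admits an event into the current batch as long as its timestamp is no later than `window_start + time_window_seconds + late_slack`; anything later flushes the batch and opens a new window at that event's timestamp. A sketch of just the admission predicate (values below are the defaults: 120 s window, 2000 ms slack truncated to 2 s):

```rust
// Window admission rule from `run`: an event joins the current window
// if its timestamp is at most window_start + window + slack seconds.
fn admits(window_start: u32, window_secs: u32, slack_secs: u32, ts: u32) -> bool {
    ts <= window_start + window_secs + slack_secs
}

fn main() {
    // window_end = 1000 + 120 = 1120; slack extends admission to 1122.
    assert!(admits(1_000, 120, 2, 1_122)); // inside window + slack
    assert!(!admits(1_000, 120, 2, 1_123)); // late: triggers a flush
}
```

Note that the integer division `cfg.late_slack_ms / 1000` truncates, so a slack of 2500 ms behaves as 2 s.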
+
+ async fn process_time_window(&self, payloads: &[EventPayload]) -> Result<()> {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let mut unique_wallets = HashSet::new();
+ let mut unique_tokens = HashSet::new();
+ let mut trades = Vec::new();
+ let mut transfers = Vec::new();
+ let mut mints = Vec::new();
+ let mut supply_locks = Vec::new();
+ let mut burns = Vec::new();
+ let mut liquidity_events = Vec::new();
+
+ for payload in payloads {
+ match &payload.event {
+ EventType::Trade(trade) => {
+ // Skip dust trades to reduce noise in downstream links/datasets
+ if trade.total_usd >= cfg.min_trade_total_usd {
+ unique_wallets.insert(trade.maker.clone());
+ unique_tokens.insert(trade.base_address.clone());
+ trades.push(trade.clone());
+ }
+ }
+ EventType::Transfer(transfer) => {
+ unique_wallets.insert(transfer.source.clone());
+ unique_wallets.insert(transfer.destination.clone());
+ transfers.push(transfer.clone());
+ }
+ EventType::Mint(mint) => {
+ unique_wallets.insert(mint.creator_address.clone());
+ unique_tokens.insert(mint.mint_address.clone());
+ mints.push(mint.clone());
+ }
+ EventType::SupplyLock(lock) => {
+ unique_wallets.insert(lock.sender.clone());
+ unique_wallets.insert(lock.recipient.clone());
+ unique_tokens.insert(lock.mint_address.clone());
+ supply_locks.push(lock.clone());
+ }
+ EventType::Burn(burn) => {
+ unique_wallets.insert(burn.source.clone());
+ unique_tokens.insert(burn.mint_address.clone());
+ burns.push(burn.clone());
+ }
+ EventType::Liquidity(liquidity) => {
+ if liquidity.change_type == 0 {
+ // 0 = Add Liquidity
+ unique_wallets.insert(liquidity.lp_provider.clone());
+ liquidity_events.push(liquidity.clone());
+ }
+ }
+ _ => {}
+ }
+ }
+
+ // Run link detection in parallel; writes remain serialized by the global Neo4j lock.
+ let parallel_start = Instant::now();
+ try_join!(
+ self.process_mints(&mints, &trades),
+ self.process_transfers_and_funding(&transfers),
+ self.process_supply_locks(&supply_locks),
+ self.process_burns(&burns),
+ self.process_liquidity_events(&liquidity_events),
+ self.process_trade_patterns(&trades, &mints),
+ )?;
+ println!(
+ "[LinkGraph] [TimeWindow] Parallel link processing finished in: {:?}",
+ parallel_start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_batch(&self, mut payloads: Vec<EventPayload>) -> Result<()> {
+ if payloads.is_empty() {
+ return Ok(());
+ }
+
+ // Payloads are already a complete time-window. We just need to sort them.
+ payloads.sort_by_key(|p| p.timestamp);
+
+ // Process the entire batch as a single logical unit with a per-worker write lock.
+ let _guard = self.write_lock.lock().await;
+ self.process_time_window(&payloads).await?;
+
+ println!(
+ "[LinkGraph] Finished processing batch of {} events.",
+ payloads.len()
+ );
+ Ok(())
+ }
+
+ async fn process_batch_with_retry(&self, payloads: Vec<EventPayload>) -> Result<()> {
+ // Serialize across all workers to avoid Neo4j deadlocks.
+ let _global_lock = NEO4J_WRITE_LOCK.lock().await;
+ let mut attempts = 0;
+ let max_retries = 3;
+ loop {
+ match self.process_batch(payloads.clone()).await {
+ Ok(_) => return Ok(()),
+ Err(e) => {
+ let err_str = e.to_string();
+ if err_str.contains("DeadlockDetected") && attempts < max_retries {
+ attempts += 1;
+ let backoff_ms = 200 * attempts;
+ eprintln!(
+ "[LinkGraph] ⚠️ Deadlock detected, retrying {}/{} after {}ms",
+ attempts, max_retries, backoff_ms
+ );
+ sleep(Duration::from_millis(backoff_ms as u64)).await;
+ continue;
+ } else {
+ return Err(e);
+ }
+ }
+ }
+ }
+ }
+
+ // --- Main Logic for Pattern Detection ---
+ fn cached_trade_from_trade(trade: &TradeRow) -> CachedTrade {
+ CachedTrade {
+ maker: trade.maker.clone(),
+ base_address: trade.base_address.clone(),
+ timestamp: trade.timestamp,
+ signature: trade.signature.clone(),
+ trade_type: trade.trade_type,
+ total_usd: trade.total_usd,
+ slippage: trade.slippage,
+ }
+ }
+
+ async fn update_trade_cache(&self, trades: &[&TradeRow]) -> Result<()> {
+ if trades.is_empty() {
+ return Ok(());
+ }
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let now_ts = trades.iter().map(|t| t.timestamp).max().unwrap_or(0);
+ let cutoff = now_ts.saturating_sub(cfg.trade_cache_ttl_secs);
+
+ let mut cache = self.trade_cache.lock().await;
+ cache.retain(|_, state| state.last_seen >= cutoff);
+
+ for trade in trades {
+ let key = (trade.maker.clone(), trade.base_address.clone());
+ let entry = cache.entry(key).or_insert_with(|| CachedPairState {
+ first_buy: None,
+ first_sell: None,
+ recent: VecDeque::new(),
+ last_seen: 0,
+ });
+
+ entry.last_seen = entry.last_seen.max(trade.timestamp);
+
+ let ct = Self::cached_trade_from_trade(trade);
+ if trade.trade_type == 0 {
+ if entry
+ .first_buy
+ .as_ref()
+ .map_or(true, |b| ct.timestamp < b.timestamp)
+ {
+ entry.first_buy = Some(ct.clone());
+ }
+ } else if trade.trade_type == 1 {
+ if entry
+ .first_sell
+ .as_ref()
+ .map_or(true, |s| ct.timestamp < s.timestamp)
+ {
+ entry.first_sell = Some(ct.clone());
+ }
+ }
+
+ entry.recent.push_back(ct);
+ while entry.recent.len() > cfg.trade_cache_max_recent {
+ entry.recent.pop_front();
+ }
+ while let Some(front) = entry.recent.front() {
+ if front.timestamp + cfg.trade_cache_ttl_secs < now_ts {
+ entry.recent.pop_front();
+ } else {
+ break;
+ }
+ }
+ }
+
+ if cache.len() > cfg.trade_cache_max_entries {
+ let mut entries: Vec<_> = cache
+ .iter()
+ .map(|(k, v)| (k.clone(), v.last_seen))
+ .collect();
+ entries.sort_by_key(|(_, ts)| *ts);
+ let to_drop = entries.len().saturating_sub(cfg.trade_cache_max_entries);
+ for (key, _) in entries.into_iter().take(to_drop) {
+ cache.remove(&key);
+ }
+ }
+ Ok(())
+ }
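
When the trade cache exceeds `trade_cache_max_entries`, `update_trade_cache` evicts the oldest pairs by `last_seen` until it is back under the cap. A sketch of that eviction step in isolation, with the pair state simplified to just a `last_seen` timestamp (the real value is a `CachedPairState`):

```rust
use std::collections::HashMap;

// Eviction used by `update_trade_cache` when the map outgrows its cap:
// sort keys by `last_seen` ascending and drop the oldest until under cap.
fn evict_oldest(cache: &mut HashMap<String, u32>, max_entries: usize) {
    if cache.len() <= max_entries {
        return;
    }
    let mut entries: Vec<_> = cache
        .iter()
        .map(|(k, ts)| (k.clone(), *ts))
        .collect();
    entries.sort_by_key(|(_, ts)| *ts);
    let to_drop = entries.len().saturating_sub(max_entries);
    for (key, _) in entries.into_iter().take(to_drop) {
        cache.remove(&key);
    }
}

fn main() {
    let mut cache = HashMap::new();
    cache.insert("stale".to_string(), 100_u32);
    cache.insert("warm".to_string(), 200);
    cache.insert("fresh".to_string(), 300);
    evict_oldest(&mut cache, 2);
    assert!(!cache.contains_key("stale"));
    assert_eq!(cache.len(), 2);
}
```

This is an O(n log n) sweep over the whole map; it only runs once the cap (1,000,000 entries by default) is exceeded, so the cost is amortized across many windows.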
+
+ async fn build_histories_from_cache(
+ &self,
+ pairs: &[(String, String)],
+ ) -> Result<HashMap<(String, String), Vec<FullHistTrade>>> {
+ let mut map = HashMap::new();
+ let cache = self.trade_cache.lock().await;
+ for pair in pairs {
+ if let Some(state) = cache.get(pair) {
+ let mut collected = Vec::new();
+ if let Some(b) = &state.first_buy {
+ collected.push(Self::cached_to_full(b));
+ }
+ if let Some(s) = &state.first_sell {
+ collected.push(Self::cached_to_full(s));
+ }
+ for t in state.recent.iter() {
+ collected.push(Self::cached_to_full(t));
+ }
+
+ if !collected.is_empty() {
+ collected.sort_by_key(|t| t.timestamp);
+ collected.dedup_by(|a, b| a.signature == b.signature);
+ map.insert(pair.clone(), collected);
+ }
+ }
+ }
+ Ok(map)
+ }
+
+ fn cached_to_full(ct: &CachedTrade) -> FullHistTrade {
+ FullHistTrade {
+ maker: ct.maker.clone(),
+ base_address: ct.base_address.clone(),
+ timestamp: ct.timestamp,
+ signature: ct.signature.clone(),
+ trade_type: ct.trade_type,
+ total_usd: ct.total_usd,
+ slippage: ct.slippage,
+ }
+ }
+
+ pub async fn writer_task(
+ mut rx: mpsc::Receiver<WriteJob>,
+ neo4j_client: Arc<Graph>,
+ writer_depth: Arc<AtomicUsize>,
+ ) {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ while let Some(job) = rx.recv().await {
+ writer_depth.fetch_sub(1, Ordering::Relaxed);
+ let batches = job
+ .params
+ .chunks(cfg.writer_max_batch_rows.max(1))
+ .map(|chunk| chunk.to_vec())
+ .collect::<Vec<_>>();
+
+ for (idx, params) in batches.iter().enumerate() {
+ let q = query(&job.query).param("x", params.clone());
+ let mut attempts = 0;
+ loop {
+ let start = Instant::now();
+ match neo4j_client.run(q.clone()).await {
+ Ok(_) => {
+ println!(
+ "[LinkGraph] [Writer] ✅ wrote {} rows (chunk {}/{}) in {:?}",
+ params.len(),
+ idx + 1,
+ batches.len(),
+ start.elapsed()
+ );
+ break;
+ }
+ Err(e) => {
+ let msg = e.to_string();
+ attempts += 1;
+ if msg.contains("DeadlockDetected")
+ && attempts <= cfg.writer_retry_attempts
+ {
+ let backoff = cfg.writer_retry_backoff_ms * attempts as u64;
+ eprintln!(
+ "[LinkGraph] [Writer] ⚠️ deadlock, retry {}/{} after {}ms: {}",
+ attempts, cfg.writer_retry_attempts, backoff, msg
+ );
+ sleep(Duration::from_millis(backoff)).await;
+ continue;
+ } else {
+ eprintln!(
+ "[LinkGraph] 🔴 Writer fatal after {} attempts: {}",
+ attempts, msg
+ );
+ std::process::exit(1);
+ }
+ }
+ }
+ }
+ }
+ }
+ eprintln!("[LinkGraph] 🔴 Writer channel closed.");
+ std::process::exit(1);
+ }
+
+ async fn enqueue_write(
+ &self,
+ cypher: &str,
+ params: Vec<HashMap<String, BoltType>>,
+ ) -> Result<()> {
+ let job = WriteJob {
+ query: cypher.to_string(),
+ params,
+ };
+ self.write_sender
+ .send(job)
+ .await
+ .map_err(|e| anyhow!("[LinkGraph] Failed to enqueue write: {}", e))?;
+ self.writer_depth.fetch_add(1, Ordering::Relaxed);
+ Ok(())
+ }
+
+ async fn process_mints(
+ &self,
+ mints: &[MintRow],
+ all_trades_in_batch: &[TradeRow],
+ ) -> Result<()> {
+ let start = Instant::now();
+ if mints.is_empty() {
+ return Ok(());
+ }
+ let mut links = Vec::new();
+
+ for mint in mints {
+ let dev_buy = all_trades_in_batch.iter().find(
+ |t| {
+ t.maker == mint.creator_address
+ && t.base_address == mint.mint_address
+ && t.trade_type == 0
+ }, // 0 = Buy
+ );
+ let buy_amount_decimals = dev_buy.map_or(0.0, |t| {
+ let quote_decimals = decimals_for_quote(&t.quote_address);
+ t.quote_amount as f64 / 10f64.powi(quote_decimals as i32)
+ });
+ links.push(MintedLink {
+ signature: mint.signature.clone(),
+ timestamp: mint.timestamp as i64,
+ buy_amount: buy_amount_decimals,
+ });
+ }
+ self.write_minted_links(&links, mints).await?;
+ println!(
+ "[LinkGraph] [Profile] process_mints: {} mints in {:?}",
+ mints.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_supply_locks(&self, locks: &[SupplyLockRow]) -> Result<()> {
+ let start = Instant::now();
+ if locks.is_empty() {
+ return Ok(());
+ }
+ let links: Vec<_> = locks
+ .iter()
+ .map(|l| LockedSupplyLink {
+ signature: l.signature.clone(),
+ amount: l.total_locked_amount as f64,
+ timestamp: l.timestamp as i64,
+ unlock_timestamp: l.final_unlock_timestamp,
+ })
+ .collect();
+ self.write_locked_supply_links(&links, locks).await?;
+ println!(
+ "[LinkGraph] [Profile] process_supply_locks: {} locks in {:?}",
+ locks.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_burns(&self, burns: &[BurnRow]) -> Result<()> {
+ let start = Instant::now();
+ if burns.is_empty() {
+ return Ok(());
+ }
+ let links: Vec<_> = burns
+ .iter()
+ .map(|b| BurnedLink {
+ signature: b.signature.clone(),
+ amount: b.amount_decimal,
+ timestamp: b.timestamp as i64,
+ })
+ .collect();
+ self.write_burned_links(&links, burns).await?;
+ println!(
+ "[LinkGraph] [Profile] process_burns: {} burns in {:?}",
+ burns.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_transfers_and_funding(&self, transfers: &[TransferRow]) -> Result<()> {
+ let start = Instant::now();
+ if transfers.is_empty() {
+ return Ok(());
+ }
+
+ // Directly map every TransferRow to a TransferLink without any extra logic.
+ let transfer_links: Vec<TransferLink> = transfers
+ .iter()
+ .map(|transfer| TransferLink {
+ source: transfer.source.clone(),
+ destination: transfer.destination.clone(),
+ signature: transfer.signature.clone(),
+ mint: transfer.mint_address.clone(),
+ timestamp: transfer.timestamp as i64,
+ amount: transfer.amount_decimal,
+ })
+ .collect();
+
+ self.write_transfer_links(&transfer_links).await?;
+ println!(
+ "[LinkGraph] [Profile] process_transfers: {} transfers in {:?}",
+ transfers.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_trade_patterns(
+ &self,
+ trades: &[TradeRow],
+ mints_in_batch: &[MintRow],
+ ) -> Result<()> {
+ let start = Instant::now();
+ if trades.is_empty() {
+ return Ok(());
+ }
+
+ let creator_map: HashMap<String, String> = mints_in_batch
+ .iter()
+ .map(|m| (m.mint_address.clone(), m.creator_address.clone()))
+ .collect();
+
+ let mut processed_pairs = HashSet::new();
+
+ let bundle_links = self.detect_bundle_trades(trades, &mut processed_pairs);
+ if !bundle_links.is_empty() {
+ self.write_bundle_trade_links(&bundle_links).await?;
+ }
+
+ let follower_links = self
+ .detect_follower_activity(trades, &mut processed_pairs)
+ .await?;
+ if !follower_links.is_empty() {
+ let mut copied_links = Vec::new();
+ let mut coordinated_links = Vec::new();
+ for link in follower_links {
+ match link {
+ FollowerLink::Copied(l) => copied_links.push(l),
+ FollowerLink::Coordinated(l) => coordinated_links.push(l),
+ }
+ }
+ if !copied_links.is_empty() {
+ self.write_copied_trade_links(&copied_links).await?;
+ }
+ if !coordinated_links.is_empty() {
+ self.write_coordinated_activity_links(&coordinated_links)
+ .await?;
+ }
+ }
+
+ self.detect_and_write_snipes(trades, creator_map).await?;
+ self.detect_and_write_whale_links(trades).await?;
+ self.detect_and_write_top_trader_links(trades).await?;
+
+ println!(
+ "[LinkGraph] [Profile] process_trade_patterns: {} trades in {:?}",
+ trades.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn detect_and_write_snipes(
+ &self,
+ trades: &[TradeRow],
+ creator_map: HashMap<String, String>,
+ ) -> Result<()> {
+ let start = Instant::now();
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let mut links: Vec<SnipedLink> = Vec::new();
+ let mut snipers_map: HashMap<String, (String, String)> = HashMap::new();
+ // Limit sniper detection to Pump.fun launchpad trades only.
+ let pump_trades: Vec<&TradeRow> = trades
+ .iter()
+ .filter(|t| t.protocol == PROTOCOL_PUMPFUN_LAUNCHPAD)
+ .collect();
+ if pump_trades.is_empty() {
+ return Ok(());
+ }
+
+ let unique_mints: HashSet<String> =
+ pump_trades.iter().map(|t| t.base_address.clone()).collect();
+ if unique_mints.is_empty() {
+ return Ok(());
+ }
+
+ // Pre-flight check: fetch current holder counts for the candidate mints.
+ #[derive(Row, Deserialize, Debug)]
+ struct TokenHolderInfo {
+ token_address: String,
+ unique_holders: u32,
+ }
+
+ let holder_check_query = "
+ SELECT token_address, unique_holders
+ FROM token_metrics_latest
+ WHERE token_address IN ?
+ ORDER BY token_address, updated_at DESC
+ LIMIT 1 BY token_address
+ ";
+ let mut holder_infos: Vec<TokenHolderInfo> = Vec::new();
+ let unique_mints_vec: Vec<_> = unique_mints.iter().cloned().collect();
+
+ for chunk in unique_mints_vec.chunks(cfg.chunk_size_large) {
+ let mut chunk_results = self
+ .with_ch_retry(
+ || async {
+ self.db_client
+ .query(holder_check_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(anyhow::Error::from)
+ },
+ "Snipes-HolderCheck chunk",
+ )
+ .await?;
+ holder_infos.append(&mut chunk_results);
+ }
+
+ let token_holder_map: HashMap<String, u32> = holder_infos
+ .into_iter()
+ .map(|t| (t.token_address, t.unique_holders))
+ .collect();
+
+ #[derive(Row, Deserialize, Clone, Debug)]
+ struct SniperInfo {
+ maker: String,
+ first_sig: String,
+ first_total: f64,
+ first_ts: u32,
+ }
+
+ #[derive(Row, Deserialize, Debug)]
+ struct TokenCreator {
+ creator_address: String,
+ }
+
+ // OPTIMIZATION: Parallelize the database queries for each mint.
+ let query_futures = unique_mints
+ .into_iter()
+ .filter(|mint| {
+ // Pre-filter mints that are too established
+ let holder_count = token_holder_map.get(mint).cloned().unwrap_or(0);
+ holder_count <= cfg.sniper_rank_threshold as u32
+ })
+ .map(|mint| {
+ let db_client = self.db_client.clone();
+ let creator_map_clone = creator_map.clone();
+ // Create an async block (a future) for each query
+ async move {
+ let snipers_query = "
+ SELECT maker,
+ argMin(signature, timestamp) as first_sig,
+ argMin(total, timestamp) as first_total,
+ min(toUInt32(timestamp)) as first_ts
+ FROM trades WHERE base_address = ? AND trade_type = 0
+ GROUP BY maker ORDER BY min(timestamp) ASC LIMIT ?
+ ";
+
+ let result = db_client
+ .query(snipers_query)
+ .bind(mint.clone()) // binds base_address
+ .bind(cfg.sniper_rank_threshold) // binds LIMIT
+ .fetch_all::<SniperInfo>()
+ .await
+ .map_err(|e| {
+ anyhow!(
+ "[SNIPER_FAIL]: Sniper fetch for mint '{}' failed. Error: {}",
+ mint,
+ e
+ )
+ });
+
+ (mint, result)
+ }
+ });
+
+ // Execute the futures concurrently with a limit of 20 at a time.
+ let results = stream::iter(query_futures)
+ .buffer_unordered(20) // CONCURRENCY LIMIT
+ .collect::<Vec<_>>()
+ .await;
+
+ // Process the results after they have all completed
+ for (mint, result) in results {
+ match result {
+ Ok(sniper_candidates) => {
+ for (i, sniper) in sniper_candidates.iter().enumerate() {
+ links.push(SnipedLink {
+ timestamp: sniper.first_ts as i64,
+ signature: sniper.first_sig.clone(),
+ rank: (i + 1) as i64,
+ sniped_amount: sniper.first_total,
+ });
+ snipers_map.insert(
+ sniper.first_sig.clone(),
+ (sniper.maker.clone(), mint.clone()),
+ );
+ }
+ }
+ Err(e) => eprintln!("[Snipers] Error processing mint {}: {}", mint, e),
+ }
+ }
+
+ if !links.is_empty() {
+ self.write_sniped_links(&links, &snipers_map).await?;
+ }
+ println!(
+ "[LinkGraph] [Profile] detect_and_write_snipes: {} links in {:?}",
+ links.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ fn detect_bundle_trades(
+ &self,
+ trades: &[TradeRow],
+ processed_pairs: &mut HashSet<(String, String)>,
+ ) -> Vec<BundleTradeLink> {
+ let mut links = Vec::new();
+ let trades_by_slot_mint = trades
+ .iter()
+ .into_group_map_by(|t| (t.slot, t.base_address.clone()));
+
+ for ((slot, mint), trades_in_bundle) in trades_by_slot_mint {
+ let unique_makers: Vec<_> =
+ trades_in_bundle.iter().map(|t| &t.maker).unique().collect();
+ if unique_makers.len() <= 1 {
+ continue;
+ }
+
+ // Leader Election: Find the trade with the max `quote_amount`.
+ // Includes a deterministic tie-breaker using the wallet address.
+ let leader_trade = match trades_in_bundle.iter().max_by(|a, b| {
+ match a.quote_amount.cmp(&b.quote_amount) {
+ std::cmp::Ordering::Equal => b.maker.cmp(&a.maker),
+ other => other,
+ }
+ }) {
+ Some(trade) => trade,
+ None => continue,
+ };
+ let leader_wallet = &leader_trade.maker;
+
+ let all_bundle_signatures: Vec<String> = trades_in_bundle
+ .iter()
+ .map(|t| t.signature.clone())
+ .collect();
+
+ for follower_trade in trades_in_bundle
+ .iter()
+ .filter(|t| &t.maker != leader_wallet)
+ {
+ let follower_wallet = &follower_trade.maker;
+
+ let mut combo_sorted = vec![leader_wallet.clone(), follower_wallet.clone()];
+ combo_sorted.sort();
+ let pair_key = (combo_sorted[0].clone(), combo_sorted[1].clone());
+
+ // Populate the processed_pairs set and create the link.
+ if processed_pairs.insert(pair_key) {
+ links.push(BundleTradeLink {
+ signatures: all_bundle_signatures.clone(),
+ wallet_a: leader_wallet.clone(),
+ wallet_b: follower_wallet.clone(),
+ mint: mint.clone(),
+ slot: slot as i64,
+ timestamp: leader_trade.timestamp as i64,
+ });
+ }
+ }
+ }
+ links
+ }
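
The leader election in `detect_bundle_trades` picks the trade with the largest `quote_amount`; on a tie, `b.maker.cmp(&a.maker)` inverts the wallet comparison so the lexicographically smaller maker wins deterministically. A sketch with trades reduced to `(quote_amount, maker)` pairs (the tuple shape is illustrative):

```rust
// Leader election from `detect_bundle_trades`: largest quote_amount
// wins; ties break deterministically toward the lexicographically
// smaller wallet, via the inverted comparison on the maker.
fn elect_leader<'a>(trades: &[(u64, &'a str)]) -> Option<(u64, &'a str)> {
    trades
        .iter()
        .max_by(|a, b| match a.0.cmp(&b.0) {
            std::cmp::Ordering::Equal => b.1.cmp(a.1),
            other => other,
        })
        .copied()
}

fn main() {
    // Distinct amounts: the bigger buy leads.
    assert_eq!(
        elect_leader(&[(10, "walletB"), (50, "walletA")]),
        Some((50, "walletA"))
    );
    // Tied amounts: the lexicographically smaller wallet leads.
    assert_eq!(
        elect_leader(&[(50, "walletB"), (50, "walletA")]),
        Some((50, "walletA"))
    );
}
```

Determinism matters here because the leader/follower orientation decides which wallet becomes `wallet_a` in the emitted link; without the tie-breaker, equal-sized trades could flip orientation between runs.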
+
+ async fn detect_follower_activity(
+ &self,
+ trades: &[TradeRow],
+ processed_pairs: &mut HashSet<(String, String)>,
+ ) -> Result<Vec<FollowerLink>> {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let mut links = Vec::new();
+ let min_usd_value = cfg.min_trade_total_usd;
+
+ let significant_trades: Vec<&TradeRow> = trades
+ .iter()
+ .filter(|t| t.total_usd >= min_usd_value)
+ .collect();
+
+ if significant_trades.len() < 2 {
+ return Ok(links);
+ }
+
+ let unique_pairs: Vec<(String, String)> = significant_trades
+ .iter()
+ .map(|t| (t.maker.clone(), t.base_address.clone()))
+ .unique()
+ .collect();
+ // Update and read from the bounded in-memory cache; fallback to CH only on misses.
+ self.update_trade_cache(&significant_trades).await?;
+ let mut historical_trades_map = self.build_histories_from_cache(&unique_pairs).await?;
+
+ let missing_pairs: Vec<(String, String)> = unique_pairs
+ .iter()
+ .filter(|k| !historical_trades_map.contains_key(*k))
+ .cloned()
+ .collect();
+ if !missing_pairs.is_empty() {
+ let historical_query = "
+ SELECT maker, base_address, toUnixTimestamp(timestamp) as timestamp, signature, trade_type, total_usd, slippage
+ FROM trades
+ WHERE (maker, base_address) IN ?
+ ";
+ for chunk in missing_pairs.chunks(cfg.chunk_size_historical) {
+ let chunk_results: Vec<FullHistTrade> = self
+ .db_client
+ .query(historical_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(|e| {
+ anyhow!(
+ "[FOLLOWER_FAIL]: Historical trade fetch failed. Error: {}",
+ e
+ )
+ })?;
+
+ for trade in chunk_results {
+ historical_trades_map
+ .entry((trade.maker.clone(), trade.base_address.clone()))
+ .or_default()
+ .push(trade);
+ }
+ }
+ }
+
+ let trades_by_mint = significant_trades
+ .into_iter()
+ .into_group_map_by(|t| t.base_address.clone());
+
+ for (mint, trades_in_batch) in trades_by_mint {
+ if trades_in_batch.len() < 2 {
+ continue;
+ }
+
+ let Some(leader_trade) = trades_in_batch.iter().min_by_key(|t| t.timestamp) else {
+ continue;
+ };
+ let leader_wallet = &leader_trade.maker;
+
+ for follower_trade in trades_in_batch.iter().filter(|t| &t.maker != leader_wallet) {
+ let follower_wallet = &follower_trade.maker;
+
+ let mut pair_key_vec = vec![leader_wallet.to_string(), follower_wallet.to_string()];
+ pair_key_vec.sort();
+ let pair_key = (pair_key_vec[0].clone(), pair_key_vec[1].clone());
+ if processed_pairs.contains(&pair_key) {
+ continue;
+ }
+
+ if let (Some(leader_hist_ref), Some(follower_hist_ref)) = (
+ historical_trades_map.get(&(leader_wallet.clone(), mint.clone())),
+ historical_trades_map.get(&(follower_wallet.clone(), mint.clone())),
+ ) {
+ let mut leader_hist = leader_hist_ref.clone();
+ let mut follower_hist = follower_hist_ref.clone();
+ leader_hist.sort_by_key(|t| t.timestamp);
+ follower_hist.sort_by_key(|t| t.timestamp);
+
+ let leader_first_trade = leader_hist.get(0);
+ let follower_first_trade = follower_hist.get(0);
+
+ // Base the classification on the very first interaction of each wallet.
+ if let (Some(l1), Some(f1)) = (leader_first_trade, follower_first_trade) {
+ let first_gap = (f1.timestamp as i64 - l1.timestamp as i64).abs();
+
+ if first_gap > 0 && first_gap <= cfg.copied_trade_window_seconds {
+ processed_pairs.insert(pair_key); // Process this pair only once
+
+ // A) If the FIRST trades are BOTH BUYS, it's a COPIED_TRADE.
+ if l1.trade_type == 0 && f1.trade_type == 0 {
+ let l_buy = l1; // Already have the first buy
+ let f_buy = f1; // Already have the first buy
+
+ let leader_sells: Vec<_> =
+ leader_hist.iter().filter(|t| t.trade_type == 1).collect();
+ let follower_sells: Vec<_> =
+ follower_hist.iter().filter(|t| t.trade_type == 1).collect();
+ let leader_sell_total: f64 =
+ leader_sells.iter().map(|t| t.total_usd).sum();
+ let follower_sell_total: f64 =
+ follower_sells.iter().map(|t| t.total_usd).sum();
+ let leader_pnl = if l_buy.total_usd > 0.0 {
+ (leader_sell_total - l_buy.total_usd) / l_buy.total_usd
+ } else {
+ 0.0
+ };
+ let follower_pnl = if f_buy.total_usd > 0.0 {
+ (follower_sell_total - f_buy.total_usd) / f_buy.total_usd
+ } else {
+ 0.0
+ };
+ let leader_first_sell =
+ leader_sells.iter().min_by_key(|t| t.timestamp);
+ let follower_first_sell =
+ follower_sells.iter().min_by_key(|t| t.timestamp);
+
+ let (sell_gap, l_sell_sig, f_sell_sig, f_sell_slip) =
+ if let (Some(l_sell), Some(f_sell)) =
+ (leader_first_sell, follower_first_sell)
+ {
+ (
+ (f_sell.timestamp as i64 - l_sell.timestamp as i64)
+ .abs(),
+ l_sell.signature.clone(),
+ f_sell.signature.clone(),
+ f_sell.slippage,
+ )
+ } else {
+ (0, "".to_string(), "".to_string(), 0.0)
+ };
+
+ links.push(FollowerLink::Copied(CopiedTradeLink {
+ timestamp: f_buy.timestamp as i64,
+ follower: follower_wallet.clone(),
+ leader: leader_wallet.clone(),
+ mint: mint.clone(),
+ time_gap_on_buy_sec: first_gap, // Use the already calculated gap
+ time_gap_on_sell_sec: sell_gap,
+ leader_pnl,
+ follower_pnl,
+ leader_buy_sig: l_buy.signature.clone(),
+ leader_sell_sig: l_sell_sig,
+ follower_buy_sig: f_buy.signature.clone(),
+ follower_sell_sig: f_sell_sig,
+ leader_buy_total: l_buy.total_usd,
+ leader_sell_total,
+ follower_buy_total: f_buy.total_usd,
1157
+ follower_sell_total,
1158
+ follower_buy_slippage: f_buy.slippage,
1159
+ follower_sell_slippage: f_sell_slip,
1160
+ }));
1161
+ }
1162
+ // B) ELSE, if the first trades are not both buys, it's a COORDINATED_ACTIVITY.
1163
+ else {
1164
+ let leader_second_trade = leader_hist.get(1);
1165
+ let follower_second_trade = follower_hist.get(1);
1166
+
1167
+ let (l2_sig, f2_sig, second_gap) = if let (Some(l2), Some(f2)) =
1168
+ (leader_second_trade, follower_second_trade)
1169
+ {
1170
+ (
1171
+ l2.signature.clone(),
1172
+ f2.signature.clone(),
1173
+ (f2.timestamp as i64 - l2.timestamp as i64).abs(),
1174
+ )
1175
+ } else {
1176
+ ("".to_string(), "".to_string(), 0)
1177
+ };
1178
+
1179
+ links.push(FollowerLink::Coordinated(CoordinatedActivityLink {
1180
+ timestamp: l1.timestamp as i64,
1181
+ leader: leader_wallet.clone(),
1182
+ follower: follower_wallet.clone(),
1183
+ mint: mint.clone(),
1184
+ leader_first_sig: l1.signature.clone(),
1185
+ follower_first_sig: f1.signature.clone(),
1186
+ time_gap_on_first_sec: first_gap,
1187
+ leader_second_sig: l2_sig,
1188
+ follower_second_sig: f2_sig,
1189
+ time_gap_on_second_sec: second_gap,
1190
+ }));
1191
+ }
1192
+ }
1193
+ }
1194
+ }
1195
+ }
1196
+ }
1197
+ Ok(links)
1198
+ }
1199
+
1200
+ async fn detect_and_write_top_trader_links(&self, trades: &[TradeRow]) -> Result<()> {
+ let start = Instant::now();
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let active_trader_pairs: Vec<(String, String)> = trades
+ .iter()
+ .map(|t| (t.maker.clone(), t.base_address.clone()))
+ .unique()
+ .collect();
+
+ if active_trader_pairs.is_empty() {
+ return Ok(());
+ }
+
+ // --- NEW: CONFIDENCE FILTER ---
+ // 1. Get all unique mints from the active pairs.
+ let unique_mints: Vec<String> = active_trader_pairs
+ .iter()
+ .map(|(_, mint)| mint.clone())
+ .unique()
+ .collect();
+
+ #[derive(Row, Deserialize, Debug)]
+ struct MintCheck {
+ mint_address: String,
+ }
+ let mint_query = "SELECT DISTINCT mint_address FROM mints WHERE mint_address IN ?";
+
+ let mut fully_tracked_mints = HashSet::new();
+ let mint_chunk_small = cfg.chunk_size_mint_small;
+
+ for chunk in unique_mints.chunks(mint_chunk_small) {
+ let chunk_rows: Vec<MintCheck> = self
+ .with_ch_retry(
+ || async {
+ self.db_client
+ .query(mint_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(anyhow::Error::from)
+ },
+ "TopTrader mint check chunk",
+ )
+ .await?;
+ for mint_row in chunk_rows {
+ fully_tracked_mints.insert(mint_row.mint_address);
+ }
+ }
+
+ // 2. Filter the active pairs to only include those for fully tracked tokens.
+ let confident_trader_pairs: Vec<(String, String)> = active_trader_pairs
+ .into_iter()
+ .filter(|(_, mint)| fully_tracked_mints.contains(mint))
+ .collect();
+
+ if confident_trader_pairs.is_empty() {
+ return Ok(());
+ }
+ // --- END CONFIDENCE FILTER ---
+
+ let mints_to_query: Vec<String> = fully_tracked_mints.iter().cloned().collect();
+ if mints_to_query.is_empty() {
+ return Ok(());
+ }
+
+ let ath_map = self
+ .fetch_latest_ath_map_with_retry(&mints_to_query)
+ .await?;
+ if ath_map.is_empty() {
+ return Ok(());
+ }
+
+ #[derive(Row, Deserialize, Debug)]
+ struct TraderContextInfo {
+ wallet_address: String,
+ mint_address: String,
+ realized_profit_pnl: f32,
+ }
+
+ let pnl_query = "
+ SELECT
+ wh.wallet_address, wh.mint_address, wh.realized_profit_pnl
+ FROM wallet_holdings_latest AS wh
+ WHERE wh.mint_address IN ?
+ AND wh.realized_profit_pnl > ?
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY wh.mint_address ORDER BY wh.realized_profit_pnl DESC) = 1
+ ";
+
+ let mut top_traders: Vec<TraderContextInfo> = Vec::new();
+
+ for chunk in mints_to_query.chunks(cfg.chunk_size_mint_large) {
+ let chunk_results = self
+ .db_client
+ .query(pnl_query)
+ .bind(chunk)
+ .bind(cfg.min_top_trader_pnl)
+ .fetch_all()
+ .await
+ .map_err(|e| anyhow!("[TOPTRADER_FAIL]: Top-1 PNL fetch failed. Error: {}", e))?;
+ top_traders.extend(chunk_results);
+ }
+
+ let links: Vec<TopTraderOfLink> = top_traders
+ .into_iter()
+ .filter_map(|trader| {
+ ath_map
+ .get(&trader.mint_address)
+ .filter(|ath| **ath >= cfg.ath_price_threshold_usd)
+ .map(|ath| TopTraderOfLink {
+ timestamp: Utc::now().timestamp(),
+ wallet: trader.wallet_address,
+ token: trader.mint_address,
+ pnl_at_creation: trader.realized_profit_pnl as f64,
+ ath_usd_at_creation: *ath,
+ })
+ })
+ .collect();
+
+ if !links.is_empty() {
+ self.write_top_trader_of_links(&links).await?;
+ }
+
+ println!(
+ "[LinkGraph] [Profile] detect_and_write_top_trader_links: {} links in {:?}",
+ links.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn process_liquidity_events(&self, liquidity_adds: &[LiquidityRow]) -> Result<()> {
+ let cfg = &*LINK_GRAPH_CONFIG;
+ if liquidity_adds.is_empty() {
+ return Ok(());
+ }
+ let unique_pools: HashSet<String> = liquidity_adds
+ .iter()
+ .map(|l| l.pool_address.clone())
+ .collect();
+ if unique_pools.is_empty() {
+ return Ok(());
+ }
+
+ #[derive(Row, Deserialize, Debug)]
+ struct PoolInfo {
+ pool_address: String,
+ base_address: String,
+ base_decimals: Option<u8>,
+ quote_decimals: Option<u8>,
+ }
+
+ let pool_query = "SELECT pool_address, base_address, base_decimals, quote_decimals FROM pool_creations WHERE pool_address IN ?";
+ let mut pools_info: Vec<PoolInfo> = Vec::new();
+ let unique_pools_vec: Vec<_> = unique_pools.iter().cloned().collect();
+
+ for chunk in unique_pools_vec.chunks(cfg.chunk_size_large) {
+ let mut chunk_results = self
+ .db_client
+ .query(pool_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(|e| anyhow!("[LIQUIDITY_FAIL]: PoolQuery chunk failed. Error: {}", e))?;
+ pools_info.append(&mut chunk_results);
+ }
+
+ let pool_to_token_map: HashMap<String, (String, Option<u8>, Option<u8>)> = pools_info
+ .into_iter()
+ .map(|p| {
+ (
+ p.pool_address,
+ (p.base_address, p.base_decimals, p.quote_decimals),
+ )
+ })
+ .collect();
+
+ let links: Vec<_> = liquidity_adds
+ .iter()
+ .filter_map(|l| {
+ pool_to_token_map.get(&l.pool_address).map(
+ |(token_address, base_decimals, quote_decimals)| {
+ let base_scale = 10f64.powi(base_decimals.unwrap_or(0) as i32);
+ let quote_scale = 10f64.powi(quote_decimals.unwrap_or(0) as i32);
+ ProvidedLiquidityLink {
+ signature: l.signature.clone(),
+ wallet: l.lp_provider.clone(),
+ token: token_address.clone(),
+ pool_address: l.pool_address.clone(),
+ amount_base: l.base_amount as f64 / base_scale,
+ amount_quote: l.quote_amount as f64 / quote_scale,
+ timestamp: l.timestamp as i64,
+ }
+ },
+ )
+ })
+ .collect();
+
+ if !links.is_empty() {
+ self.write_provided_liquidity_links(&links).await?;
+ }
+ Ok(())
+ }
+
+ async fn detect_and_write_whale_links(&self, trades: &[TradeRow]) -> Result<()> {
+ let start = Instant::now();
+ let cfg = &*LINK_GRAPH_CONFIG;
+ let unique_mints_in_batch: Vec<String> = trades
+ .iter()
+ .map(|t| t.base_address.clone())
+ .unique()
+ .collect();
+ if unique_mints_in_batch.is_empty() {
+ return Ok(());
+ }
+
+ // --- NEW: CONFIDENCE FILTER ---
+ // 1. Check which of the mints in the batch have a creation event in our DB.
+ #[derive(Row, Deserialize, Debug)]
+ struct MintCheck {
+ mint_address: String,
+ }
+ let mint_query = "SELECT DISTINCT mint_address FROM mints WHERE mint_address IN ?";
+
+ let mut fully_tracked_mints = HashSet::new();
+ for chunk in unique_mints_in_batch.chunks(cfg.chunk_size_mint_large) {
+ let chunk_rows: Vec<MintCheck> = self
+ .with_ch_retry(
+ || async {
+ self.db_client
+ .query(mint_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(anyhow::Error::from)
+ },
+ "Whale mint check chunk",
+ )
+ .await?;
+ for mint_row in chunk_rows {
+ fully_tracked_mints.insert(mint_row.mint_address);
+ }
+ }
+
+ if fully_tracked_mints.is_empty() {
+ return Ok(());
+ }
+ let confident_mints: Vec<String> = fully_tracked_mints.iter().cloned().collect();
+ let ath_map = self
+ .fetch_latest_ath_map_with_retry(&confident_mints)
+ .await?;
+ if ath_map.is_empty() {
+ return Ok(());
+ }
+ // --- END CONFIDENCE FILTER ---
+
+ #[derive(Row, Deserialize, Debug)]
+ struct TokenInfo {
+ token_address: String,
+ total_supply: u64,
+ decimals: u8,
+ }
+
+ let token_query = "SELECT token_address, total_supply, decimals FROM tokens FINAL WHERE token_address IN ?";
+
+ // --- RE-INTRODUCED CHUNKING for the token pre-filter ---
+ let mut context_map: HashMap<String, (u64, f64, u8)> = HashMap::new();
+
+ for chunk in confident_mints.chunks(cfg.chunk_size_token) {
+ let mut attempts = 0;
+ loop {
+ attempts += 1;
+ let result: Result<Vec<TokenInfo>> = self
+ .db_client
+ .query(token_query)
+ .bind(chunk)
+ .fetch_all()
+ .await
+ .map_err(anyhow::Error::from);
+
+ match result {
+ Ok(chunk_results) => {
+ for token in chunk_results {
+ if let Some(ath) = ath_map.get(&token.token_address) {
+ if *ath >= cfg.ath_price_threshold_usd {
+ context_map.insert(
+ token.token_address,
+ (token.total_supply, *ath, token.decimals),
+ );
+ }
+ }
+ }
+ break;
+ }
+ Err(e) => {
+ if attempts >= cfg.ch_retry_attempts {
+ return Err(anyhow!(
+ "[WHALE_FAIL]: Token pre-filter chunk failed after {} attempts: {}",
+ attempts,
+ e
+ ));
+ }
+ let backoff = cfg.ch_retry_backoff_ms * attempts as u64;
+ eprintln!(
+ "[LinkGraph] ⚠️ Whale token pre-filter retry {}/{} after {}ms: {}",
+ attempts, cfg.ch_retry_attempts, backoff, e
+ );
+ sleep(Duration::from_millis(backoff)).await;
+ }
+ }
+ }
+ }
+ // --- END CHUNKING ---
+
+ if context_map.is_empty() {
+ return Ok(());
+ }
+
+ let tokens_to_query: Vec<String> = context_map.keys().cloned().collect();
+
+ #[derive(Row, Deserialize, Debug)]
+ struct WhaleInfo {
+ wallet_address: String,
+ mint_address: String,
+ current_balance: f64,
+ }
+
+ let whales_query = "
+ SELECT wallet_address, mint_address, current_balance
+ FROM wallet_holdings_latest
+ WHERE mint_address IN ? AND current_balance > 0
+ QUALIFY ROW_NUMBER() OVER (PARTITION BY mint_address ORDER BY current_balance DESC) <= ?
+ ";
+
+ // --- RE-INTRODUCED CHUNKING for the main whale query ---
+ let mut top_holders: Vec<WhaleInfo> = Vec::new();
+ for chunk in tokens_to_query.chunks(cfg.chunk_size_token) {
+ let mut attempts = 0;
+ loop {
+ attempts += 1;
+ let result: Result<Vec<WhaleInfo>> = self
+ .db_client
+ .query(whales_query)
+ .bind(chunk)
+ .bind(cfg.whale_rank_threshold)
+ .fetch_all()
+ .await
+ .map_err(anyhow::Error::from);
+
+ match result {
+ Ok(chunk_results) => {
+ top_holders.extend(chunk_results);
+ break;
+ }
+ Err(e) => {
+ if attempts >= cfg.ch_retry_attempts {
+ return Err(anyhow!(
+ "[WHALE_FAIL]: Holder query chunk failed after {} attempts: {}",
+ attempts,
+ e
+ ));
+ }
+ let backoff = cfg.ch_retry_backoff_ms * attempts as u64;
+ eprintln!(
+ "[LinkGraph] ⚠️ Whale holder chunk retry {}/{} after {}ms: {}",
+ attempts, cfg.ch_retry_attempts, backoff, e
+ );
+ sleep(Duration::from_millis(backoff)).await;
+ }
+ }
+ }
+ }
+ // --- END CHUNKING ---
+
+ let mut links = Vec::new();
+ for holder in top_holders {
+ if let Some((raw_total_supply, ath_usd, decimals)) =
+ context_map.get(&holder.mint_address)
+ {
+ if *raw_total_supply == 0 {
+ continue;
+ }
+
+ // --- THE FIX ---
+ // Adjust the total supply to be human-readable before dividing.
+ let human_total_supply = *raw_total_supply as f64 / 10f64.powi(*decimals as i32);
+ if human_total_supply == 0.0 {
+ continue;
+ }
+ // --- END FIX ---
+
+ let holding_pct = (holder.current_balance / human_total_supply) as f32;
+
+ links.push(WhaleOfLink {
+ timestamp: Utc::now().timestamp(),
+ wallet: holder.wallet_address.clone(),
+ token: holder.mint_address.clone(),
+ holding_pct_at_creation: holding_pct,
+ ath_usd_at_creation: *ath_usd,
+ });
+ }
+ }
+
+ if !links.is_empty() {
+ self.write_whale_of_links(&links).await?;
+ }
+
+ println!(
+ "[LinkGraph] [Profile] detect_and_write_whale_links: {} links in {:?}",
+ links.len(),
+ start.elapsed()
+ );
+ Ok(())
+ }
+
+ async fn create_wallet_nodes(&self, wallets: &HashSet<String>) -> Result<()> {
+ if wallets.is_empty() {
+ return Ok(());
+ }
+ let cfg = &*LINK_GRAPH_CONFIG;
+
+ // Convert the HashSet to a Vec to be able to create chunks
+ let wallet_vec: Vec<_> = wallets.iter().cloned().collect();
+
+ // Process the wallets in smaller, manageable chunks
+ for chunk in wallet_vec.chunks(cfg.chunk_size_large) {
+ let params: Vec<_> = chunk
+ .iter()
+ .map(|addr| HashMap::from([("address".to_string(), BoltType::from(addr.clone()))]))
+ .collect();
+
+ // Bind under $x to match the parameter name enqueue_write uses
+ // for every other writer in this file.
+ let cypher = "
+ UNWIND $x as wallet
+ MERGE (w:Wallet {address: wallet.address})
+ ";
+
+ self.enqueue_write(cypher, params).await?;
+ }
+ Ok(())
+ }
+
+ async fn create_token_nodes(&self, tokens: &HashSet<String>) -> Result<()> {
+ if tokens.is_empty() {
+ return Ok(());
+ }
+ let cfg = &*LINK_GRAPH_CONFIG;
+
+ // Convert the HashSet to a Vec to be able to create chunks
+ let token_vec: Vec<_> = tokens.iter().cloned().collect();
+
+ // Process the tokens in smaller, manageable chunks
+ for chunk in token_vec.chunks(cfg.chunk_size_large) {
+ let params: Vec<_> = chunk
+ .iter()
+ .map(|addr| HashMap::from([("address".to_string(), BoltType::from(addr.clone()))]))
+ .collect();
+
+ // Bind under $x to match the parameter name enqueue_write uses
+ // for every other writer. The params only carry the address, so
+ // no created_ts is set here; the MINTED link records the timestamp.
+ let cypher = "
+ UNWIND $x as token
+ MERGE (t:Token {address: token.address})
+ ";
+
+ self.enqueue_write(cypher, params).await?;
+ }
+ Ok(())
+ }
+
+ async fn write_bundle_trade_links(&self, links: &[BundleTradeLink]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let params: Vec<_> = links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("wa".to_string(), BoltType::from(l.wallet_a.clone())),
+ ("wb".to_string(), BoltType::from(l.wallet_b.clone())),
+ ("mint".to_string(), BoltType::from(l.mint.clone())),
+ ("slot".to_string(), BoltType::from(l.slot)),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ (
+ "signatures".to_string(),
+ BoltType::from(l.signatures.clone()),
+ ),
+ ])
+ })
+ .collect();
+ // Corrected relationship name to BUNDLE_TRADE for consistency
+ let cypher = "
+ UNWIND $x as t
+ MERGE (a:Wallet {address: t.wa})
+ MERGE (b:Wallet {address: t.wb})
+ MERGE (a)-[r:BUNDLE_TRADE {mint: t.mint, slot: t.slot}]->(b)
+ ON CREATE SET r.timestamp = t.timestamp, r.signatures = t.signatures
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_transfer_links(&self, links: &[TransferLink]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+
+ // --- THE FIX ---
+ // Use `unique_by` to get the *entire first link object* for each unique path.
+ // This preserves the signature and timestamp from the first event we see.
+ let unique_links = links
+ .iter()
+ .unique_by(|l| (&l.source, &l.destination, &l.mint))
+ .collect::<Vec<_>>();
+
+ // Now build the parameters with the full data from the unique links.
+ let params: Vec<_> = unique_links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("source".to_string(), BoltType::from(l.source.clone())),
+ (
+ "destination".to_string(),
+ BoltType::from(l.destination.clone()),
+ ),
+ ("mint".to_string(), BoltType::from(l.mint.clone())),
+ ("signature".to_string(), BoltType::from(l.signature.clone())), // Include the signature
+ ("timestamp".to_string(), BoltType::from(l.timestamp)), // Include the on-chain timestamp
+ ("amount".to_string(), BoltType::from(l.amount)),
+ ])
+ })
+ .collect();
+
+ // --- UPDATED CYPHER QUERY ---
+ // The query now sets the signature and on-chain timestamp on the link when it's first created.
+ let cypher = "
+ UNWIND $x as t
+ MERGE (s:Wallet {address: t.source})
+ MERGE (d:Wallet {address: t.destination})
+ MERGE (s)-[r:TRANSFERRED_TO {mint: t.mint}]->(d)
+ ON CREATE SET
+ r.signature = t.signature,
+ r.timestamp = t.timestamp,
+ r.amount = t.amount
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_coordinated_activity_links(
+ &self,
+ links: &[CoordinatedActivityLink],
+ ) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+
+ let params: Vec<_> = links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("leader".to_string(), BoltType::from(l.leader.clone())),
+ ("follower".to_string(), BoltType::from(l.follower.clone())),
+ ("mint".to_string(), BoltType::from(l.mint.clone())),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ // Use the new, correct field names
+ (
+ "l_sig_1".to_string(),
+ BoltType::from(l.leader_first_sig.clone()),
+ ),
+ (
+ "l_sig_2".to_string(),
+ BoltType::from(l.leader_second_sig.clone()),
+ ),
+ (
+ "f_sig_1".to_string(),
+ BoltType::from(l.follower_first_sig.clone()),
+ ),
+ (
+ "f_sig_2".to_string(),
+ BoltType::from(l.follower_second_sig.clone()),
+ ),
+ ("gap_1".to_string(), BoltType::from(l.time_gap_on_first_sec)),
+ (
+ "gap_2".to_string(),
+ BoltType::from(l.time_gap_on_second_sec),
+ ),
+ ])
+ })
+ .collect();
+
+ // This query now creates a single, comprehensive link per pair/mint
+ let cypher = "
+ UNWIND $x as t
+ MERGE (l:Wallet {address: t.leader})
+ MERGE (f:Wallet {address: t.follower})
+ MERGE (f)-[r:COORDINATED_ACTIVITY {mint: t.mint}]->(l)
+ ON CREATE SET
+ r.timestamp = t.timestamp,
+ r.leader_first_sig = t.l_sig_1,
+ r.leader_second_sig = t.l_sig_2,
+ r.follower_first_sig = t.f_sig_1,
+ r.follower_second_sig = t.f_sig_2,
+ r.time_gap_on_first_sec = t.gap_1,
+ r.time_gap_on_second_sec = t.gap_2
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_copied_trade_links(&self, links: &[CopiedTradeLink]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ // This uses the latest struct definition provided in the prompt.
+ let params: Vec<_> = links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("follower".to_string(), BoltType::from(l.follower.clone())),
+ ("leader".to_string(), BoltType::from(l.leader.clone())),
+ ("mint".to_string(), BoltType::from(l.mint.clone())),
+ ("buy_gap".to_string(), BoltType::from(l.time_gap_on_buy_sec)),
+ (
+ "sell_gap".to_string(),
+ BoltType::from(l.time_gap_on_sell_sec),
+ ),
+ ("leader_pnl".to_string(), BoltType::from(l.leader_pnl)),
+ ("follower_pnl".to_string(), BoltType::from(l.follower_pnl)),
+ (
+ "l_buy_sig".to_string(),
+ BoltType::from(l.leader_buy_sig.clone()),
+ ),
+ (
+ "l_sell_sig".to_string(),
+ BoltType::from(l.leader_sell_sig.clone()),
+ ),
+ (
+ "f_buy_sig".to_string(),
+ BoltType::from(l.follower_buy_sig.clone()),
+ ),
+ (
+ "f_sell_sig".to_string(),
+ BoltType::from(l.follower_sell_sig.clone()),
+ ),
+ (
+ "l_buy_total".to_string(),
+ BoltType::from(l.leader_buy_total),
+ ),
+ (
+ "l_sell_total".to_string(),
+ BoltType::from(l.leader_sell_total),
+ ),
+ (
+ "f_buy_total".to_string(),
+ BoltType::from(l.follower_buy_total),
+ ),
+ (
+ "f_sell_total".to_string(),
+ BoltType::from(l.follower_sell_total),
+ ),
+ (
+ "f_buy_slip".to_string(),
+ BoltType::from(l.follower_buy_slippage),
+ ),
+ (
+ "f_sell_slip".to_string(),
+ BoltType::from(l.follower_sell_slippage),
+ ),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ .collect();
+ let cypher = "
+ UNWIND $x as t
+ MERGE (f:Wallet {address: t.follower})
+ MERGE (l:Wallet {address: t.leader})
+ MERGE (f)-[r:COPIED_TRADE {mint: t.mint}]->(l)
+ ON CREATE SET
+ r.timestamp = t.timestamp,
+ r.follower = t.follower,
+ r.leader = t.leader,
+ r.mint = t.mint,
+ r.buy_gap = t.buy_gap,
+ r.sell_gap = t.sell_gap,
+ r.leader_pnl = t.leader_pnl,
+ r.follower_pnl = t.follower_pnl,
+ r.l_buy_sig = t.l_buy_sig,
+ r.l_sell_sig = t.l_sell_sig,
+ r.f_buy_sig = t.f_buy_sig,
+ r.f_sell_sig = t.f_sell_sig,
+ r.l_buy_total = t.l_buy_total,
+ r.l_sell_total = t.l_sell_total,
+ r.f_buy_total = t.f_buy_total,
+ r.f_sell_total = t.f_sell_total,
+ r.f_buy_slip = t.f_buy_slip,
+ r.f_sell_slip = t.f_sell_slip
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_minted_links(&self, links: &[MintedLink], mints: &[MintRow]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let mint_map: HashMap<_, _> = mints.iter().map(|m| (m.signature.clone(), m)).collect();
+
+ let params: Vec<_> = links
+ .iter()
+ .filter_map(|l| {
+ mint_map.get(&l.signature).map(|m| {
+ HashMap::from([
+ (
+ "creator".to_string(),
+ BoltType::from(m.creator_address.clone()),
+ ),
+ ("token".to_string(), BoltType::from(m.mint_address.clone())),
+ ("signature".to_string(), BoltType::from(l.signature.clone())),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ("buy_amount".to_string(), BoltType::from(l.buy_amount)),
+ ])
+ })
+ })
+ .collect();
+
+ if params.is_empty() {
+ return Ok(());
+ }
+ // --- MODIFIED: MERGE on the signature for idempotency ---
+ let cypher = "
+ UNWIND $x as t
+ MERGE (c:Wallet {address: t.creator})
+ MERGE (k:Token {address: t.token})
+ MERGE (c)-[r:MINTED {signature: t.signature}]->(k)
+ ON CREATE SET r.timestamp = t.timestamp, r.buy_amount = t.buy_amount
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_sniped_links(
+ &self,
+ links: &[SnipedLink],
+ snipers: &HashMap<String, (String, String)>,
+ ) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+
+ let params: Vec<_> = links
+ .iter()
+ .filter_map(|l| {
+ snipers.get(&l.signature).map(|(wallet, token)| {
+ HashMap::from([
+ ("wallet".to_string(), BoltType::from(wallet.clone())),
+ ("token".to_string(), BoltType::from(token.clone())),
+ ("signature".to_string(), BoltType::from(l.signature.clone())),
+ ("rank".to_string(), BoltType::from(l.rank)),
+ ("sniped_amount".to_string(), BoltType::from(l.sniped_amount)),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ })
+ .collect();
+
+ if params.is_empty() {
+ return Ok(());
+ }
+
+ // --- MODIFIED: MERGE on signature ---
+ let cypher = "
+ UNWIND $x as t
+ MERGE (w:Wallet {address: t.wallet})
+ MERGE (k:Token {address: t.token})
+ MERGE (w)-[r:SNIPED {signature: t.signature}]->(k)
+ ON CREATE SET r.rank = t.rank, r.sniped_amount = t.sniped_amount, r.timestamp = t.timestamp
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_locked_supply_links(
+ &self,
+ links: &[LockedSupplyLink],
+ locks: &[SupplyLockRow],
+ ) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let lock_map: HashMap<_, _> = locks.iter().map(|l| (l.signature.clone(), l)).collect();
+
+ let params: Vec<_> = links
+ .iter()
+ .filter_map(|l| {
+ lock_map.get(&l.signature).map(|lock_row| {
+ HashMap::from([
+ (
+ "sender".to_string(),
+ BoltType::from(lock_row.sender.clone()),
+ ),
+ (
+ "recipient".to_string(),
+ BoltType::from(lock_row.recipient.clone()),
+ ),
+ (
+ "mint".to_string(),
+ BoltType::from(lock_row.mint_address.clone()),
+ ),
+ ("signature".to_string(), BoltType::from(l.signature.clone())),
+ ("amount".to_string(), BoltType::from(l.amount)),
+ (
+ "unlock_ts".to_string(),
+ BoltType::from(l.unlock_timestamp as i64),
+ ),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ })
+ .collect();
+
+ if params.is_empty() {
+ return Ok(());
+ }
+
+ // --- THE CRITICAL FIX ---
+ let cypher = "
+ UNWIND $x as t
+ MERGE (s:Wallet {address: t.sender})
+ MERGE (k:Token {address: t.mint})
+ MERGE (s)-[r:LOCKED_SUPPLY {signature: t.signature}]->(k)
+ ON CREATE SET r.amount = t.amount, r.unlock_timestamp = t.unlock_ts, r.recipient = t.recipient, r.timestamp = t.timestamp
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_burned_links(&self, links: &[BurnedLink], burns: &[BurnRow]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let burn_map: HashMap<_, _> = burns.iter().map(|b| (b.signature.clone(), b)).collect();
+
+ let params: Vec<_> = links
+ .iter()
+ .filter_map(|l| {
+ burn_map.get(&l.signature).map(|burn_row| {
+ HashMap::from([
+ (
+ "wallet".to_string(),
+ BoltType::from(burn_row.source.clone()),
+ ),
+ (
+ "token".to_string(),
+ BoltType::from(burn_row.mint_address.clone()),
+ ),
+ ("signature".to_string(), BoltType::from(l.signature.clone())),
+ ("amount".to_string(), BoltType::from(l.amount)),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ })
+ .collect();
+
+ if params.is_empty() {
+ return Ok(());
+ }
+ // --- MODIFIED: MERGE on signature ---
+ let cypher = "
+ UNWIND $x as t
+ MATCH (w:Wallet {address: t.wallet}), (k:Token {address: t.token})
+ MERGE (w)-[r:BURNED {signature: t.signature}]->(k)
+ ON CREATE SET r.amount = t.amount, r.timestamp = t.timestamp
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_provided_liquidity_links(&self, links: &[ProvidedLiquidityLink]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let params: Vec<_> = links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("wallet".to_string(), BoltType::from(l.wallet.clone())),
+ ("token".to_string(), BoltType::from(l.token.clone())),
+ ("signature".to_string(), BoltType::from(l.signature.clone())),
+ (
+ "pool_address".to_string(),
+ BoltType::from(l.pool_address.clone()),
+ ),
+ ("amount_base".to_string(), BoltType::from(l.amount_base)),
+ ("amount_quote".to_string(), BoltType::from(l.amount_quote)),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ .collect();
+
+ // --- MODIFIED: MERGE on signature ---
+ let cypher = "
+ UNWIND $x as t
+ MERGE (w:Wallet {address: t.wallet})
+ MERGE (k:Token {address: t.token})
+ MERGE (w)-[r:PROVIDED_LIQUIDITY {signature: t.signature}]->(k)
+ ON CREATE SET r.pool_address = t.pool_address, r.amount_base = t.amount_base, r.amount_quote = t.amount_quote, r.timestamp = t.timestamp
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
+
+ async fn write_top_trader_of_links(&self, links: &[TopTraderOfLink]) -> Result<()> {
+ if links.is_empty() {
+ return Ok(());
+ }
+ let params: Vec<_> = links
+ .iter()
+ .map(|l| {
+ HashMap::from([
+ ("wallet".to_string(), BoltType::from(l.wallet.clone())),
+ ("token".to_string(), BoltType::from(l.token.clone())),
+ // Add new params
+ (
+ "pnl_at_creation".to_string(),
+ BoltType::from(l.pnl_at_creation),
+ ),
+ (
+ "ath_at_creation".to_string(),
+ BoltType::from(l.ath_usd_at_creation),
+ ),
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
+ ])
+ })
+ .collect();
+
+ // --- MODIFIED: The definitive Cypher query ---
+ let cypher = "
+ UNWIND $x as t
+ MERGE (w:Wallet {address: t.wallet})
+ MERGE (k:Token {address: t.token})
+ MERGE (w)-[r:TOP_TRADER_OF]->(k)
+ ON CREATE SET
+ r.pnl_at_creation = t.pnl_at_creation,
+ r.ath_usd_at_creation = t.ath_at_creation,
+ r.timestamp = t.timestamp
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
+ ";
+ self.enqueue_write(cypher, params).await
+ }
2149
+
2150
+ async fn write_whale_of_links(&self, links: &[WhaleOfLink]) -> Result<()> {
2151
+ if links.is_empty() {
2152
+ return Ok(());
2153
+ }
2154
+ let params: Vec<_> = links
2155
+ .iter()
2156
+ .map(|l| {
2157
+ HashMap::from([
2158
+ ("wallet".to_string(), BoltType::from(l.wallet.clone())),
2159
+ ("token".to_string(), BoltType::from(l.token.clone())),
2160
+ // Add new params
2161
+ (
2162
+ "pct_at_creation".to_string(),
2163
+ BoltType::from(l.holding_pct_at_creation),
2164
+ ),
2165
+ (
2166
+ "ath_at_creation".to_string(),
2167
+ BoltType::from(l.ath_usd_at_creation),
2168
+ ),
2169
+ ("timestamp".to_string(), BoltType::from(l.timestamp)),
2170
+ ])
2171
+ })
2172
+ .collect();
2173
+
2174
+ // --- MODIFIED: The definitive Cypher query ---
2175
+ let cypher = "
2176
+ UNWIND $x as t
2177
+ MERGE (w:Wallet {address: t.wallet})
2178
+ MERGE (k:Token {address: t.token})
2179
+ MERGE (w)-[r:WHALE_OF]->(k)
2180
+ ON CREATE SET
2181
+ r.holding_pct_at_creation = t.pct_at_creation,
2182
+ r.ath_usd_at_creation = t.ath_at_creation,
2183
+ r.timestamp = t.timestamp
2184
+ ON MATCH SET r.timestamp = CASE WHEN t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END
2185
+ ";
2186
+ self.enqueue_write(cypher, params).await
2187
+ }
2188
+
2189
+ async fn fetch_latest_ath_map_with_retry(
2190
+ &self,
2191
+ token_addresses: &[String],
2192
+ ) -> Result<HashMap<String, f64>> {
2193
+ let mut ath_map = HashMap::new();
2194
+ if token_addresses.is_empty() {
2195
+ return Ok(ath_map);
2196
+ }
2197
+ let cfg = &*LINK_GRAPH_CONFIG;
2198
+
2199
+ #[derive(Row, Deserialize, Debug)]
2200
+ struct AthInfo {
2201
+ token_address: String,
2202
+ ath_price_usd: f64,
2203
+ }
2204
+
2205
+ let query = "
2206
+ SELECT token_address, ath_price_usd
2207
+ FROM token_metrics_latest
2208
+ WHERE token_address IN ?
2209
+ ORDER BY token_address, updated_at DESC
2210
+ LIMIT 1 BY token_address
2211
+ ";
2212
+
2213
+ for chunk in token_addresses.chunks(cfg.ath_fetch_chunk_size.max(1)) {
2214
+ let mut attempts = 0;
2215
+ loop {
2216
+ attempts += 1;
2217
+ let result: Result<Vec<AthInfo>> = self
2218
+ .db_client
2219
+ .query(query)
2220
+ .bind(chunk)
2221
+ .fetch_all()
2222
+ .await
2223
+ .map_err(|e| anyhow!("[LinkGraph] ATH fetch failed: {}", e));
2224
+
2225
+ match result {
2226
+ Ok(mut chunk_rows) => {
2227
+ for row in chunk_rows.drain(..) {
2228
+ ath_map.insert(row.token_address, row.ath_price_usd);
2229
+ }
2230
+ break;
2231
+ }
2232
+ Err(e) => {
2233
+ if attempts >= cfg.ch_retry_attempts {
2234
+ eprintln!(
2235
+ "[LinkGraph] 🔴 ATH fetch failed after {} attempts: {}",
2236
+ attempts, e
2237
+ );
2238
+ std::process::exit(1);
2239
+ }
2240
+ let backoff = cfg.ch_retry_backoff_ms * attempts as u64;
2241
+ eprintln!(
2242
+ "[LinkGraph] ⚠️ ATH fetch retry {}/{} after {}ms: {}",
2243
+ attempts, cfg.ch_retry_attempts, backoff, e
2244
+ );
2245
+ sleep(Duration::from_millis(backoff)).await;
2246
+ }
2247
+ }
2248
+ }
2249
+ }
2250
+
2251
+ Ok(ath_map)
2252
+ }
2253
+
2254
+     async fn fetch_pnl(&self, wallet_address: &str, mint_address: &str) -> Result<f64> {
+         // Bind the parameters rather than formatting them into the SQL string,
+         // which avoids quoting bugs and SQL injection.
+         let q_str = "SELECT realized_profit_pnl FROM wallet_holdings_latest \
+                      WHERE wallet_address = ? AND mint_address = ?";
+         // Fetch the pre-calculated f32 value
+         let pnl_f32 = self
+             .with_ch_retry(
+                 || async {
+                     self.db_client
+                         .query(q_str)
+                         .bind(wallet_address)
+                         .bind(mint_address)
+                         .fetch_one::<f32>()
+                         .await
+                         .map_err(anyhow::Error::from)
+                 },
+                 "Fetch PNL",
+             )
+             .await?;
+         // Cast to f64 for the return type
+         Ok(pnl_f32 as f64)
+     }
+ }
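`fetch_latest_ath_map_with_retry` above walks the address list in chunks and retries each chunk with a backoff that grows linearly with the attempt count before giving up. A minimal Python sketch of that control flow (the function and parameter names here are illustrative, not from the repo):

```python
import time

def fetch_with_retry(addresses, fetch_chunk, chunk_size=2,
                     attempts_max=3, backoff_ms=100, sleep=time.sleep):
    """Chunked fetch with linear backoff, mirroring fetch_latest_ath_map_with_retry."""
    result = {}
    step = max(chunk_size, 1)  # guard against a zero/negative chunk size, like .max(1)
    for i in range(0, len(addresses), step):
        chunk = addresses[i:i + step]
        attempts = 0
        while True:
            attempts += 1
            try:
                result.update(fetch_chunk(chunk))
                break
            except Exception:
                if attempts >= attempts_max:
                    raise  # the Rust version exits the process here
                sleep(backoff_ms * attempts / 1000.0)  # backoff grows linearly with attempts
    return result
```

Unlike the Rust version, which hard-exits on exhaustion, this sketch re-raises so the caller decides.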
log.log ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47b5b03f090da19eba850d54ea4cab1a97ebfdb7712ef4842cfc43804ec411b8
+ size 10517118
models/HoldersEncoder.py ADDED
@@ -0,0 +1,81 @@
+ import torch
+ import torch.nn as nn
+ from typing import List, Dict, Any
+
+ class HolderDistributionEncoder(nn.Module):
+     """
+     Encodes a list of top holders (wallet embeddings + holding percentages)
+     into a single fixed-size embedding representing the holder distribution.
+     It uses a Transformer Encoder to capture patterns and relationships.
+     """
+     def __init__(self,
+                  wallet_embedding_dim: int,
+                  output_dim: int,
+                  nhead: int = 4,
+                  num_layers: int = 2,
+                  dtype: torch.dtype = torch.float16):
+         super().__init__()
+         self.wallet_embedding_dim = wallet_embedding_dim
+         self.output_dim = output_dim
+         self.dtype = dtype
+
+         # 1. MLP to project holding percentage to the wallet embedding dimension
+         self.pct_proj = nn.Sequential(
+             nn.Linear(1, wallet_embedding_dim // 4),
+             nn.GELU(),
+             nn.Linear(wallet_embedding_dim // 4, wallet_embedding_dim)
+         ).to(dtype)
+
+         # 2. Transformer Encoder to process the sequence of holders
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=wallet_embedding_dim,
+             nhead=nhead,
+             dim_feedforward=wallet_embedding_dim * 4,
+             dropout=0.1,
+             activation='gelu',
+             batch_first=True,
+             dtype=dtype
+         )
+         self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+
+         # 3. A learnable [CLS] token to aggregate the sequence information
+         self.cls_token = nn.Parameter(torch.randn(1, 1, wallet_embedding_dim, dtype=dtype))
+
+         # 4. Final projection layer to get the desired output dimension
+         self.final_proj = nn.Linear(wallet_embedding_dim, output_dim).to(dtype)
+
+     def forward(self, holder_data: List[Dict[str, Any]]) -> torch.Tensor:
+         """
+         Args:
+             holder_data: A list of dictionaries, where each dict contains:
+                 'wallet_embedding': A tensor of shape [wallet_embedding_dim]
+                 'pct': The holding percentage as a float.
+
+         Returns:
+             A tensor of shape [1, output_dim] representing the entire distribution.
+         """
+         if not holder_data:
+             # Return a zero tensor if there are no holders
+             return torch.zeros(1, self.output_dim, device=self.cls_token.device, dtype=self.dtype)
+
+         # Prepare inputs for the transformer
+         wallet_embeds = torch.stack([d['wallet_embedding'] for d in holder_data])
+         holder_pcts = torch.tensor([[d['pct']] for d in holder_data], device=wallet_embeds.device, dtype=self.dtype)
+
+         # Project percentages and add to wallet embeddings to create holder features
+         pct_embeds = self.pct_proj(holder_pcts)
+         holder_inputs = (wallet_embeds + pct_embeds).unsqueeze(0)  # Add batch dimension
+
+         # Prepend the [CLS] token
+         batch_size = holder_inputs.size(0)
+         cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+         transformer_input = torch.cat((cls_tokens, holder_inputs), dim=1)
+
+         # Pass through the transformer
+         transformer_output = self.transformer_encoder(transformer_input)
+
+         # Get the embedding of the [CLS] token (the first token)
+         cls_embedding = transformer_output[:, 0, :]
+
+         # Project to the final output dimension
+         return self.final_proj(cls_embedding)
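Before the Transformer pass, the encoder above lifts each scalar holding percentage into the wallet-embedding space with a small MLP and adds it to the wallet embedding. A self-contained sketch of just that fusion step (dimensions and values are made up for illustration; `float32` is used so it runs on CPU):

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
d = 32  # assumed wallet embedding dimension

# Scalar pct -> d-dimensional vector, as HolderDistributionEncoder.pct_proj does
pct_proj = nn.Sequential(nn.Linear(1, d // 4), nn.GELU(), nn.Linear(d // 4, d))

wallet_embeds = torch.randn(5, d)  # 5 top holders
pcts = torch.tensor([[0.20], [0.10], [0.05], [0.03], [0.01]])

# Fuse by addition and add a batch dimension, ready for a batch_first Transformer
holder_inputs = (wallet_embeds + pct_proj(pcts)).unsqueeze(0)  # shape [1, 5, 32]
```

Addition (rather than concatenation) keeps the sequence width at `d_model`, so the Transformer sees one token per holder.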
models/SocialEncoders.py ADDED
@@ -0,0 +1,245 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Optional, Dict, Any
+ import models.vocabulary as vocab  # For event type IDs
+
+ class XPostEncoder(nn.Module):
+     """ Encodes: <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: Wallet (d_model) + Text (d_model) + Media (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 3, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self, author_emb: torch.Tensor, text_emb: torch.Tensor, media_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([author_emb, text_emb, media_emb], dim=-1)
+         return self.mlp(combined)
+
+ class XRetweetEncoder(nn.Module):
+     """ Encodes: <RetweeterWalletEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: Retweeter (d_model) + Original Author (d_model) + Original Text (d_model) + Original Media (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 4, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self,
+                 retweeter_emb: torch.Tensor,
+                 orig_author_emb: torch.Tensor,
+                 orig_text_emb: torch.Tensor,
+                 orig_media_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([retweeter_emb, orig_author_emb, orig_text_emb, orig_media_emb], dim=-1)
+         return self.mlp(combined)
+
+ class XReplyEncoder(nn.Module):
+     """ Encodes: <AuthorWalletEmbedding>, <PostTextEmbedding>, <MediaEmbedding>, <MainTweetEmbedding> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: Author (d_model) + Reply Text (d_model) + Reply Media (d_model) + Main Tweet Text (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 4, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self,
+                 author_emb: torch.Tensor,
+                 text_emb: torch.Tensor,
+                 media_emb: torch.Tensor,
+                 main_tweet_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([author_emb, text_emb, media_emb, main_tweet_emb], dim=-1)
+         return self.mlp(combined)
+
+ class XQuoteTweetEncoder(nn.Module):
+     """ Encodes: <QuoterWalletEmbedding>, <QuoterTextEmbedding>, <OriginalAuthorWalletEmbedding>, <OriginalPostTextEmbedding>, <OriginalPostMediaEmbedding> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: Quoter Wallet (d_model) + Quoter Text (d_model) + Orig Author (d_model) + Orig Text (d_model) + Orig Media (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 5, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self,
+                 quoter_wallet_emb: torch.Tensor,
+                 quoter_text_emb: torch.Tensor,
+                 orig_author_emb: torch.Tensor,
+                 orig_text_emb: torch.Tensor,
+                 orig_media_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([quoter_wallet_emb, quoter_text_emb, orig_author_emb, orig_text_emb, orig_media_emb], dim=-1)
+         return self.mlp(combined)
+
+ class PumpReplyEncoder(nn.Module):
+     """ Encodes: <UserWalletEmbedding>, <ReplyTextEmbedding> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: User Wallet (d_model) + Reply Text (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 2, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self, user_emb: torch.Tensor, text_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([user_emb, text_emb], dim=-1)
+         return self.mlp(combined)
+
+ # --- Encoders for other text-based events ---
+ class DexProfileUpdatedEncoder(nn.Module):
+     """ Encodes: <website_emb>, <twitter_emb>, <telegram_emb>, <description_emb> (the profile flags are projected separately in the main model) """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: 4x text embeds (d_model each); the flags are handled separately
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model * 4, d_model * 2),
+             nn.GELU(),
+             nn.LayerNorm(d_model * 2),
+             nn.Linear(d_model * 2, d_model)
+         ).to(dtype)
+
+     def forward(self, website_emb: torch.Tensor, twitter_emb: torch.Tensor, telegram_emb: torch.Tensor, description_emb: torch.Tensor) -> torch.Tensor:
+         combined = torch.cat([website_emb, twitter_emb, telegram_emb, description_emb], dim=-1)
+         return self.mlp(combined)
+
+ class GlobalTrendingEncoder(nn.Module):
+     """ Encodes: <hashtag_emb> """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         # Input: hashtag_emb (d_model)
+         self.mlp = nn.Sequential(
+             nn.Linear(d_model, d_model),
+             nn.GELU(),
+             nn.Linear(d_model, d_model)
+         ).to(dtype)
+
+     def forward(self, hashtag_emb: torch.Tensor) -> torch.Tensor:
+         return self.mlp(hashtag_emb)
+
+ class SocialEncoder(nn.Module):
+     """
+     A single module to house all social event encoders.
+     This simplifies instantiation in the main Oracle model.
+     """
+     def __init__(self, d_model: int, dtype: torch.dtype):
+         super().__init__()
+         self.x_post_encoder = XPostEncoder(d_model, dtype)
+         self.x_retweet_encoder = XRetweetEncoder(d_model, dtype)
+         self.x_reply_encoder = XReplyEncoder(d_model, dtype)
+         self.x_quote_tweet_encoder = XQuoteTweetEncoder(d_model, dtype)
+         self.pump_reply_encoder = PumpReplyEncoder(d_model, dtype)
+         self.dex_profile_encoder = DexProfileUpdatedEncoder(d_model, dtype)
+         self.global_trending_encoder = GlobalTrendingEncoder(d_model, dtype)
+
+         # Store for convenience
+         self.d_model = d_model
+         self.dtype = dtype
+
+     def forward(self, batch: Dict[str, Any], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
+         """
+         Processes all text-based events for the entire batch in a vectorized way,
+         replacing the per-event loops previously used in the main Oracle model.
+         """
+         device = gathered_embeds['wallet'].device
+         B, L, D = gathered_embeds['wallet'].shape
+         final_embeds = torch.zeros(B, L, D, device=device, dtype=self.dtype)
+
+         textual_event_indices = batch['textual_event_indices']
+         textual_event_data = batch.get('textual_event_data', [])
+         precomputed_lookup = gathered_embeds['precomputed']
+
+         # --- Create masks for each event type ---
+         event_type_ids = batch['event_type_ids']
+         event_masks = {
+             'XPost': (event_type_ids == vocab.EVENT_TO_ID.get('XPost', -1)),
+             'XReply': (event_type_ids == vocab.EVENT_TO_ID.get('XReply', -1)),
+             'XRetweet': (event_type_ids == vocab.EVENT_TO_ID.get('XRetweet', -1)),
+             'XQuoteTweet': (event_type_ids == vocab.EVENT_TO_ID.get('XQuoteTweet', -1)),
+             'PumpReply': (event_type_ids == vocab.EVENT_TO_ID.get('PumpReply', -1)),
+             'DexProfile_Updated': (event_type_ids == vocab.EVENT_TO_ID.get('DexProfile_Updated', -1)),
+             'TikTok_Trending_Hashtag': (event_type_ids == vocab.EVENT_TO_ID.get('TikTok_Trending_Hashtag', -1)),
+             'XTrending_Hashtag': (event_type_ids == vocab.EVENT_TO_ID.get('XTrending_Hashtag', -1)),
+         }
+
+         # --- Gather all necessary pre-computed embeddings in one go ---
+         # Flatten indices for efficient lookup, then reshape
+         flat_indices = textual_event_indices.flatten()
+         # Create a default event structure for padding indices (idx=0)
+         default_event = {'event_type': 'PAD'}
+         # Indices from the collator are 1-based, so textual_event_data[idx - 1]
+         raw_events_flat = [textual_event_data[idx - 1] if idx > 0 else default_event for idx in flat_indices.tolist()]
+
+         # Helper to gather embeddings for a specific key
+         def gather_precomputed(key: str) -> torch.Tensor:
+             indices = torch.tensor([e.get(key, 0) for e in raw_events_flat], device=device, dtype=torch.long)
+             return F.embedding(indices, precomputed_lookup).view(B, L, -1)
+
+         # --- Process each event type ---
+
+         # XPost
+         if event_masks['XPost'].any():
+             text_emb = gather_precomputed('text_emb_idx')
+             media_emb = gather_precomputed('media_emb_idx')
+             post_embeds = self.x_post_encoder(gathered_embeds['wallet'], text_emb, media_emb)
+             final_embeds += post_embeds * event_masks['XPost'].unsqueeze(-1)
+
+         # XReply
+         if event_masks['XReply'].any():
+             text_emb = gather_precomputed('text_emb_idx')
+             media_emb = gather_precomputed('media_emb_idx')
+             main_tweet_emb = gather_precomputed('main_tweet_text_emb_idx')
+             reply_embeds = self.x_reply_encoder(gathered_embeds['wallet'], text_emb, media_emb, main_tweet_emb)
+             final_embeds += reply_embeds * event_masks['XReply'].unsqueeze(-1)
+
+         # XRetweet
+         if event_masks['XRetweet'].any():
+             orig_text_emb = gather_precomputed('original_post_text_emb_idx')
+             orig_media_emb = gather_precomputed('original_post_media_emb_idx')
+             retweet_embeds = self.x_retweet_encoder(gathered_embeds['wallet'], gathered_embeds['original_author'], orig_text_emb, orig_media_emb)
+             final_embeds += retweet_embeds * event_masks['XRetweet'].unsqueeze(-1)
+
+         # XQuoteTweet
+         if event_masks['XQuoteTweet'].any():
+             quoter_text_emb = gather_precomputed('quoter_text_emb_idx')
+             orig_text_emb = gather_precomputed('original_post_text_emb_idx')
+             orig_media_emb = gather_precomputed('original_post_media_emb_idx')
+             quote_embeds = self.x_quote_tweet_encoder(gathered_embeds['wallet'], quoter_text_emb, gathered_embeds['original_author'], orig_text_emb, orig_media_emb)
+             final_embeds += quote_embeds * event_masks['XQuoteTweet'].unsqueeze(-1)
+
+         # PumpReply
+         if event_masks['PumpReply'].any():
+             text_emb = gather_precomputed('reply_text_emb_idx')
+             pump_reply_embeds = self.pump_reply_encoder(gathered_embeds['wallet'], text_emb)
+             final_embeds += pump_reply_embeds * event_masks['PumpReply'].unsqueeze(-1)
+
+         # DexProfile_Updated
+         if event_masks['DexProfile_Updated'].any():
+             website_emb = gather_precomputed('website_emb_idx')
+             twitter_emb = gather_precomputed('twitter_link_emb_idx')
+             telegram_emb = gather_precomputed('telegram_link_emb_idx')
+             description_emb = gather_precomputed('description_emb_idx')
+             profile_embeds = self.dex_profile_encoder(website_emb, twitter_emb, telegram_emb, description_emb)
+             # The flags are handled separately in the main model, so only the text embeds are added here
+             final_embeds += profile_embeds * event_masks['DexProfile_Updated'].unsqueeze(-1)
+
+         # Global Trending Hashtags
+         trending_mask = event_masks['TikTok_Trending_Hashtag'] | event_masks['XTrending_Hashtag']
+         if trending_mask.any():
+             hashtag_emb = gather_precomputed('hashtag_name_emb_idx')
+             trending_embeds = self.global_trending_encoder(hashtag_emb)
+             final_embeds += trending_embeds * trending_mask.unsqueeze(-1)
+
+         return final_embeds
models/__init__.py ADDED
File without changes
models/graph_updater.py ADDED
@@ -0,0 +1,486 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ # We still use GATv2Conv, just not the to_hetero wrapper
4
+ from torch_geometric.nn import GATv2Conv
5
+ from torch_geometric.data import HeteroData
6
+ from typing import Dict, List, Any
7
+ from collections import defaultdict # For easy aggregation
8
+ from PIL import Image
9
+
10
+ from models.helper_encoders import ContextualTimeEncoder # Type hint for constructor compatibility
11
+ # Import the actual ID_TO_LINK_TYPE mapping
12
+ from models.vocabulary import ID_TO_LINK_TYPE
13
+ # Import other modules needed for the test block
14
+ import models.vocabulary
15
+ from models.wallet_encoder import WalletEncoder
16
+ from models.token_encoder import TokenEncoder
17
+ from models.multi_modal_processor import MultiModalEncoder
18
+
19
+
20
+ class _TransferLinkEncoder(nn.Module):
21
+ """Encodes: transfer amount only (timestamps removed)."""
22
+ def __init__(self, out_dim: int, dtype: torch.dtype):
23
+ super().__init__()
24
+ self.proj = nn.Sequential(
25
+ nn.Linear(1, out_dim),
26
+ nn.GELU(),
27
+ nn.Linear(out_dim, out_dim)
28
+ )
29
+ self.dtype = dtype
30
+
31
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
32
+ return torch.sign(x) * torch.log1p(torch.abs(x))
33
+
34
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
35
+ amounts = torch.tensor([[l.get('amount', 0.0)] for l in links], device=device, dtype=self.dtype)
36
+ features = self._safe_signed_log(amounts)
37
+
38
+ return self.proj(features)
39
+
40
+ class _BundleTradeLinkEncoder(nn.Module):
41
+ """Encodes: total_amount across bundle (timestamps removed)."""
42
+ def __init__(self, out_dim: int, dtype: torch.dtype):
43
+ super().__init__()
44
+ self.proj = nn.Sequential(
45
+ nn.Linear(1, out_dim),
46
+ nn.GELU(),
47
+ nn.Linear(out_dim, out_dim)
48
+ )
49
+ self.dtype = dtype
50
+
51
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
52
+ return torch.sign(x) * torch.log1p(torch.abs(x))
53
+
54
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
55
+ totals = torch.tensor([[l.get('total_amount', 0.0)] for l in links], device=device, dtype=self.dtype)
56
+ total_embeds = self._safe_signed_log(totals)
57
+
58
+ return self.proj(total_embeds)
59
+
60
+ class _CopiedTradeLinkEncoder(nn.Module):
61
+ """ Encodes: 10 numerical features """
62
+ def __init__(self, in_features: int, out_dim: int, dtype: torch.dtype): # Added dtype
63
+ super().__init__()
64
+ self.in_features = in_features
65
+ self.norm = nn.LayerNorm(in_features)
66
+ self.mlp = nn.Sequential(
67
+ nn.Linear(in_features, out_dim * 2), nn.GELU(),
68
+ nn.Linear(out_dim * 2, out_dim)
69
+ )
70
+ self.dtype = dtype # Store dtype
71
+
72
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
73
+ return torch.sign(x) * torch.log1p(torch.abs(x))
74
+
75
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
76
+ num_data = []
77
+ for l in links:
78
+ # --- FIXED: Only use the 6 essential features ---
79
+ num_data.append([
80
+ l.get('time_gap_on_buy_sec', 0), l.get('time_gap_on_sell_sec', 0),
81
+ l.get('leader_pnl', 0), l.get('follower_pnl', 0),
82
+ l.get('follower_buy_total', 0), l.get('follower_sell_total', 0)
83
+ ])
84
+ # Create tensor with correct dtype
85
+ x = torch.tensor(num_data, device=device, dtype=self.dtype)
86
+ # Input to norm must match norm's dtype
87
+ x_norm = self.norm(self._safe_signed_log(x))
88
+ return self.mlp(x_norm)
89
+
90
+ class _CoordinatedActivityLinkEncoder(nn.Module):
91
+ """ Encodes: 2 numerical features """
92
+ def __init__(self, in_features: int, out_dim: int, dtype: torch.dtype): # Added dtype
93
+ super().__init__()
94
+ self.in_features = in_features
95
+ self.norm = nn.LayerNorm(in_features)
96
+ self.mlp = nn.Sequential(
97
+ nn.Linear(in_features, out_dim), nn.GELU(),
98
+ nn.Linear(out_dim, out_dim)
99
+ )
100
+ self.dtype = dtype # Store dtype
101
+
102
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
103
+ return torch.sign(x) * torch.log1p(torch.abs(x))
104
+
105
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
106
+ num_data = []
107
+ for l in links:
108
+ num_data.append([
109
+ l.get('time_gap_on_first_sec', 0), l.get('time_gap_on_second_sec', 0)
110
+ ])
111
+ # Create tensor with correct dtype
112
+ x = torch.tensor(num_data, device=device, dtype=self.dtype)
113
+ x_norm = self.norm(self._safe_signed_log(x))
114
+ return self.mlp(x_norm)
115
+
116
+ class _MintedLinkEncoder(nn.Module):
117
+ """Encodes: buy_amount only (timestamps removed)."""
118
+ def __init__(self, out_dim: int, dtype: torch.dtype):
119
+ super().__init__()
120
+ self.proj = nn.Sequential(
121
+ nn.Linear(1, out_dim),
122
+ nn.GELU(),
123
+ nn.Linear(out_dim, out_dim)
124
+ )
125
+ self.dtype = dtype # Store dtype
126
+
127
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
128
+ return torch.sign(x) * torch.log1p(torch.abs(x))
129
+
130
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
131
+ nums = torch.tensor([[l['buy_amount']] for l in links], device=device, dtype=self.dtype)
132
+
133
+ num_embeds = self._safe_signed_log(nums)
134
+
135
+ return self.proj(num_embeds)
136
+
137
+ class _SnipedLinkEncoder(nn.Module):
138
+ """ Encodes: rank, sniped_amount """
139
+ def __init__(self, in_features: int, out_dim: int, dtype: torch.dtype): # Added dtype
140
+ super().__init__()
141
+ self.norm = nn.LayerNorm(in_features)
142
+ self.mlp = nn.Sequential(nn.Linear(in_features, out_dim), nn.GELU(), nn.Linear(out_dim, out_dim))
143
+ self.dtype = dtype # Store dtype
144
+
145
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
146
+ num_data = [[l.get('rank', 0), l.get('sniped_amount', 0)] for l in links]
147
+ # Create tensor with correct dtype
148
+ x = torch.tensor(num_data, device=device, dtype=self.dtype)
149
+
150
+ # --- FIXED: Selectively log-scale features ---
151
+ # Invert rank so 1 is highest, treat as linear. Log-scale sniped_amount.
152
+ x[:, 0] = 1.0 / torch.clamp(x[:, 0], min=1.0) # Invert rank, clamp to avoid division by zero
153
+ x[:, 1] = torch.sign(x[:, 1]) * torch.log1p(torch.abs(x[:, 1])) # Log-scale amount
154
+
155
+ x_norm = self.norm(x)
156
+ return self.mlp(x_norm)
157
+
158
+ class _LockedSupplyLinkEncoder(nn.Module):
159
+ """ Encodes: amount """
160
+ def __init__(self, out_dim: int, dtype: torch.dtype): # Removed time_encoder
161
+ super().__init__()
162
+ self.proj = nn.Sequential(
163
+ nn.Linear(1, out_dim),
164
+ nn.GELU(),
165
+ nn.Linear(out_dim, out_dim)
166
+ )
167
+ self.dtype = dtype # Store dtype
168
+
169
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
170
+ return torch.sign(x) * torch.log1p(torch.abs(x))
171
+
172
+ def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
173
+ nums = torch.tensor([[l['amount']] for l in links], device=device, dtype=self.dtype)
174
+ num_embeds = self._safe_signed_log(nums)
175
+ return self.proj(num_embeds)
176
+
177
+ class _BurnedLinkEncoder(nn.Module):
178
+ """Encodes: burned amount (timestamps removed)."""
179
+ def __init__(self, out_dim: int, dtype: torch.dtype):
180
+ super().__init__()
181
+ self.proj = nn.Sequential(
182
+ nn.Linear(1, out_dim),
183
+ nn.GELU(),
184
+ nn.Linear(out_dim, out_dim)
185
+ )
186
+ self.dtype = dtype
187
+
188
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
189
+        return torch.sign(x) * torch.log1p(torch.abs(x))
+
+    def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
+        amounts = torch.tensor([[l.get('amount', 0.0)] for l in links], device=device, dtype=self.dtype)
+        amount_embeds = self._safe_signed_log(amounts)
+
+        return self.proj(amount_embeds)
+
+class _ProvidedLiquidityLinkEncoder(nn.Module):
+    """Encodes: quote amount (timestamps removed)."""
+    def __init__(self, out_dim: int, dtype: torch.dtype):
+        super().__init__()
+        self.proj = nn.Sequential(
+            nn.Linear(1, out_dim),
+            nn.GELU(),
+            nn.Linear(out_dim, out_dim)
+        )
+        self.dtype = dtype
+
+    def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
+        return torch.sign(x) * torch.log1p(torch.abs(x))
+
+    def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
+        quote_amounts = torch.tensor([[l.get('amount_quote', 0.0)] for l in links], device=device, dtype=self.dtype)
+        quote_embeds = self._safe_signed_log(quote_amounts)
+
+        return self.proj(quote_embeds)
+
+class _WhaleOfLinkEncoder(nn.Module):
+    """Encodes: holding_pct_at_creation."""
+    def __init__(self, out_dim: int, dtype: torch.dtype):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(1, out_dim),
+            nn.GELU(),
+            nn.Linear(out_dim, out_dim)
+        )
+        self.dtype = dtype
+
+    def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
+        vals = torch.tensor([[l.get('holding_pct_at_creation', 0.0)] for l in links], device=device, dtype=self.dtype)
+        vals_log = torch.sign(vals) * torch.log1p(torch.abs(vals))
+        return self.mlp(vals_log)
+
+class _TopTraderOfLinkEncoder(nn.Module):
+    """Encodes: pnl_at_creation."""
+    def __init__(self, out_dim: int, dtype: torch.dtype):  # Removed in_features
+        super().__init__()
+        self.mlp = nn.Sequential(nn.Linear(1, out_dim), nn.GELU(), nn.Linear(out_dim, out_dim))
+        self.dtype = dtype
+
+    def forward(self, links: List[Dict[str, Any]], device) -> torch.Tensor:
+        num_data = [[l.get('pnl_at_creation', 0)] for l in links]
+        x = torch.tensor(num_data, device=device, dtype=self.dtype)
+        log_scaled_x = torch.sign(x) * torch.log1p(torch.abs(x))
+        return self.mlp(log_scaled_x)
+
+
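All of the link encoders above rely on the same signed-log transform, `sign(x) * log1p(|x|)`, to compress heavy-tailed on-chain amounts while preserving sign and keeping zero at zero. A minimal scalar sketch of the same transform (plain Python rather than torch; the function name `signed_log` is mine):

```python
import math

def signed_log(x: float) -> float:
    # sign(x) * log1p(|x|): an odd function, zero-preserving,
    # roughly linear near 0 and logarithmic for large |x|
    return math.copysign(math.log1p(abs(x)), x)
```

This keeps a 10x difference in raw amount roughly a constant additive step in feature space, which behaves far better under LayerNorm than raw token amounts spanning many orders of magnitude.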
+class RelationalGATBlock(nn.Module):
+    """
+    Shared GATv2Conv that remains relation-aware by concatenating a learned
+    relation embedding to every edge attribute before message passing.
+    """
+
+    def __init__(
+        self,
+        node_dim: int,
+        edge_attr_dim: int,
+        n_heads: int,
+        relations: List[str],
+        dtype: torch.dtype,
+    ):
+        super().__init__()
+        self.rel_to_id = {name: idx for idx, name in enumerate(relations)}
+        self.edge_attr_dim = edge_attr_dim
+        self.rel_emb = nn.Embedding(len(relations), edge_attr_dim)
+        self.conv = GATv2Conv(
+            in_channels=node_dim,
+            out_channels=node_dim,
+            heads=n_heads,
+            concat=False,
+            dropout=0.1,
+            add_self_loops=False,
+            edge_dim=edge_attr_dim * 2,  # concat of edge attr + relation emb
+        ).to(dtype)
+
+    def forward(
+        self,
+        x_src: torch.Tensor,
+        x_dst: torch.Tensor,
+        edge_index: torch.Tensor,
+        edge_attr: torch.Tensor,
+        rel_type: str,
+    ) -> torch.Tensor:
+        num_edges = edge_index.size(1)
+        device = edge_index.device
+
+        if edge_attr is None:
+            edge_attr = torch.zeros(
+                num_edges,
+                self.edge_attr_dim,
+                device=device,
+                dtype=x_src.dtype,
+            )
+
+        rel_id = self.rel_to_id.get(rel_type)
+        if rel_id is None:
+            raise KeyError(f"Relation '{rel_type}' not registered in RelationalGATBlock.")
+
+        rel_feat = self.rel_emb.weight[rel_id].to(edge_attr.dtype)
+        rel_feat = rel_feat.expand(num_edges, -1)
+        augmented_attr = torch.cat([edge_attr, rel_feat], dim=-1)
+
+        return self.conv((x_src, x_dst), edge_index, edge_attr=augmented_attr)
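`RelationalGATBlock` keeps a single shared GATv2Conv per (src, dst) node-type pair and injects relation identity by concatenating a learned per-relation vector onto every edge attribute row, which is why the conv's `edge_dim` is `edge_attr_dim * 2`. The augmentation step can be sketched without torch (plain lists stand in for tensors; names here are mine):

```python
def augment_edge_attrs(edge_attrs, rel_emb_table, rel_id):
    """Concatenate the relation's learned vector onto each edge attribute row."""
    rel_vec = rel_emb_table[rel_id]
    # list concatenation plays the role of torch.cat([...], dim=-1)
    return [row + rel_vec for row in edge_attrs]

rel_table = {0: [0.1, 0.2], 1: [0.9, 0.8]}   # stand-in 2-dim relation embeddings
attrs = [[1.0, 2.0], [3.0, 4.0]]             # 2 edges, edge_attr_dim = 2
augmented = augment_edge_attrs(attrs, rel_table, 1)
```

Because the relation vector is learned, the shared attention weights can specialize per relation without a separate conv (and its parameters) for every edge type.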
+
+# =============================================================================
+# 2. The Main GraphUpdater (GNN) - MANUAL HETEROGENEOUS IMPLEMENTATION
+# =============================================================================
+
+class GraphUpdater(nn.Module):
+    """
+    FIXED: Manually implements heterogeneous GNN logic using separate GATv2Conv
+    layers for each edge type, bypassing the problematic `to_hetero` wrapper.
+    """
+
+    def __init__(self, time_encoder: ContextualTimeEncoder, edge_attr_dim: int = 64,
+                 n_heads: int = 4, num_layers: int = 2, node_dim: int = 2048, dtype: torch.dtype = torch.float16):
+        super().__init__()
+        self.node_dim = node_dim
+        self.edge_attr_dim = edge_attr_dim
+        self.num_layers = num_layers
+        self.dtype = dtype
+
+        # --- Instantiate all 12 Link Feature Encoders --- (Unchanged)
+        self.edge_encoders = nn.ModuleDict({
+            'TransferLink': _TransferLinkEncoder(edge_attr_dim, dtype=dtype),
+            'TransferLinkToken': _TransferLinkEncoder(edge_attr_dim, dtype=dtype),
+            'BundleTradeLink': _BundleTradeLinkEncoder(edge_attr_dim, dtype=dtype),
+            'CopiedTradeLink': _CopiedTradeLinkEncoder(6, edge_attr_dim, dtype=dtype),  # FIXED: in_features=6
+            'CoordinatedActivityLink': _CoordinatedActivityLinkEncoder(2, edge_attr_dim, dtype=dtype),
+            'MintedLink': _MintedLinkEncoder(edge_attr_dim, dtype=dtype),
+            'SnipedLink': _SnipedLinkEncoder(2, edge_attr_dim, dtype=dtype),
+            'LockedSupplyLink': _LockedSupplyLinkEncoder(edge_attr_dim, dtype=dtype),  # FIXED: No time_encoder
+            'BurnedLink': _BurnedLinkEncoder(edge_attr_dim, dtype=dtype),
+            'ProvidedLiquidityLink': _ProvidedLiquidityLinkEncoder(edge_attr_dim, dtype=dtype),
+            'WhaleOfLink': _WhaleOfLinkEncoder(edge_attr_dim, dtype=dtype),  # FIXED: No in_features
+            'TopTraderOfLink': _TopTraderOfLinkEncoder(edge_attr_dim, dtype=dtype),  # FIXED: No in_features
+        }).to(dtype)
+
+        # --- Define shared relational GNN blocks per meta edge direction ---
+        self.edge_groups = self._build_edge_groups()
+        self.conv_layers = nn.ModuleList()
+        for _ in range(num_layers):
+            conv_dict = nn.ModuleDict()
+            for (src_type, dst_type), relations in self.edge_groups.items():
+                conv_dict[f"{src_type}__{dst_type}"] = RelationalGATBlock(
+                    node_dim=node_dim,
+                    edge_attr_dim=edge_attr_dim,
+                    n_heads=n_heads,
+                    relations=relations,
+                    dtype=dtype,
+                )
+            self.conv_layers.append(conv_dict)
+
+        self.norm = nn.LayerNorm(node_dim)
+        self.to(dtype)  # Move norm layer and ModuleList container
+
+    def _build_edge_groups(self) -> Dict[tuple, List[str]]:
+        """Group relations by (src_type, dst_type) so conv weights can be shared."""
+        groups: Dict[tuple, List[str]] = defaultdict(list)
+
+        wallet_wallet_links = ['TransferLink', 'BundleTradeLink', 'CopiedTradeLink', 'CoordinatedActivityLink']
+        wallet_token_links = [
+            'TransferLinkToken', 'MintedLink', 'SnipedLink', 'LockedSupplyLink',
+            'BurnedLink', 'ProvidedLiquidityLink', 'WhaleOfLink', 'TopTraderOfLink'
+        ]
+
+        for link in wallet_wallet_links:
+            groups[('wallet', 'wallet')].append(link)
+            groups[('wallet', 'wallet')].append(f"rev_{link}")
+
+        for link in wallet_token_links:
+            groups[('wallet', 'token')].append(link)
+            groups[('token', 'wallet')].append(f"rev_{link}")
+
+        return groups
+
+    def forward(
+        self,
+        x_dict: Dict[str, torch.Tensor],
+        edge_data_dict: Dict[str, Dict[str, Any]]
+    ) -> Dict[str, torch.Tensor]:
+        device = x_dict['wallet'].device
+
+        # --- 1. Encode Edge Attributes ---
+        edge_index_dict = {}
+        edge_attr_dict = {}
+
+        for link_name, data in edge_data_dict.items():
+            edge_index = data.get('edge_index')
+            links = data.get('links', [])
+
+            # Check that edge_index is valid before proceeding
+            if edge_index is None or edge_index.numel() == 0 or not links:
+                continue  # Skip if no links or index of this type
+
+            edge_index = edge_index.to(device)
+
+            # Use the vocabulary to get the triplet (src, rel, dst).
+            # Make sure LINK_NAME_TO_TRIPLET is correctly populated.
+            if link_name not in vocabulary.LINK_NAME_TO_TRIPLET:
+                print(f"Warning: Link name '{link_name}' not found in vocabulary.LINK_NAME_TO_TRIPLET. Skipping.")
+                continue
+            src_type, rel_type, dst_type = vocabulary.LINK_NAME_TO_TRIPLET[link_name]
+
+            # Check that an encoder exists for this link name
+            if link_name not in self.edge_encoders:
+                print(f"Warning: No edge encoder found for link type '{link_name}'. Skipping edge attributes.")
+                edge_attr = None  # Or handle differently if attributes are essential
+            else:
+                edge_attr = self.edge_encoders[link_name](links, device).to(self.dtype)
+
+            # Forward link
+            fwd_key = (src_type, rel_type, dst_type)
+            edge_index_dict[fwd_key] = edge_index
+            if edge_attr is not None:
+                edge_attr_dict[fwd_key] = edge_attr
+
+            # Reverse link
+            # Ensure edge_index has the right shape for flipping
+            if edge_index.shape[0] == 2:
+                rev_edge_index = edge_index[[1, 0]]
+                rev_rel_type = f'rev_{rel_type}'
+                rev_key = (dst_type, rev_rel_type, src_type)
+                edge_index_dict[rev_key] = rev_edge_index
+                if edge_attr is not None:
+                    # Re-use the same attributes for the reverse edge
+                    edge_attr_dict[rev_key] = edge_attr
+            else:
+                print(f"Warning: Edge index for {link_name} has unexpected shape {edge_index.shape}. Cannot create reverse edge.")
+
+        # --- 2. Run GNN Layers MANUALLY ---
+        x_out = x_dict
+        for i in range(self.num_layers):
+            # Initialize aggregation tensors for each node type that exists in the input
+            msg_aggregates = {
+                node_type: torch.zeros_like(x_node)
+                for node_type, x_node in x_out.items()
+            }
+
+            # --- Message Passing ---
+            for edge_type_tuple in edge_index_dict.keys():  # Iterate through edges PRESENT in the batch
+                src_type, rel_type, dst_type = edge_type_tuple
+                edge_index = edge_index_dict[edge_type_tuple]
+                edge_attr = edge_attr_dict.get(edge_type_tuple)  # Use .get() in case attr is None
+
+                x_src = x_out.get(src_type)
+                x_dst = x_out.get(dst_type)
+                if x_src is None or x_dst is None:
+                    print(f"Warning: Missing node embeddings for types {src_type}->{dst_type}. Skipping.")
+                    continue
+
+                block_key = f"{src_type}__{dst_type}"
+                if block_key not in self.conv_layers[i]:
+                    print(f"Warning: Relational block for {block_key} not found in layer {i}. Skipping.")
+                    continue
+                block = self.conv_layers[i][block_key]
+
+                try:
+                    messages = block(x_src, x_dst, edge_index, edge_attr, rel_type)
+                except KeyError:
+                    print(f"Warning: Relation '{rel_type}' missing in block {block_key}. Skipping.")
+                    continue
+
+                # *** THE FIX ***
+                # GATv2Conv with a bipartite (x_src, x_dst) input already returns one
+                # aggregated row per destination node, so the per-relation outputs can
+                # simply be summed. This correctly handles multiple edge types pointing
+                # to the same node type.
+                msg_aggregates[dst_type] += messages
+
+            # --- Aggregation & Update (Residual Connection) ---
+            x_next = {}
+            for node_type, x_original in x_out.items():
+                # Check that messages were computed and stored correctly
+                if node_type in msg_aggregates and msg_aggregates[node_type].shape[0] > 0:
+                    aggregated_msgs = msg_aggregates[node_type]
+                    # Ensure dimensions match before adding
+                    if x_original.shape == aggregated_msgs.shape:
+                        x_next[node_type] = self.norm(x_original + aggregated_msgs)
+                    else:
+                        print(f"Warning: Shape mismatch for node type {node_type} during update. Original: {x_original.shape}, Aggregated: {aggregated_msgs.shape}. Skipping residual connection.")
+                        x_next[node_type] = x_original  # Fallback
+                else:
+                    x_next[node_type] = x_original
+
+            x_out = x_next
+
+        return x_out
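The manual heterogeneous loop in `GraphUpdater.forward` merges outputs from many edge types into a single accumulator per node type before the residual LayerNorm update. When messages are produced per edge rather than per node, this merge is a scatter-add: each message row is summed into the row of its destination node. A dependency-free sketch of that index-based accumulation (plain lists stand in for tensors; names are mine):

```python
def scatter_add(messages, dst_index, num_nodes, dim):
    """Sum each message row into the row of its destination node
    (the pattern torch's scatter_add_ implements for dim=0)."""
    out = [[0.0] * dim for _ in range(num_nodes)]
    for msg, dst in zip(messages, dst_index):
        for j in range(dim):
            out[dst][j] += msg[j]
    return out

msgs = [[1.0, 1.0], [2.0, 0.0], [0.5, 0.5]]  # 3 edges, 2-dim messages
dst = [0, 2, 0]                               # edges point at nodes 0, 2, 0
agg = scatter_add(msgs, dst, num_nodes=3, dim=2)
```

Nodes that receive no edges keep a zero row, so the residual update above reduces to `norm(x_original)` for them only when messages were computed, and to `x_original` otherwise.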
models/helper_encoders.py ADDED
@@ -0,0 +1,87 @@
+import torch
+import torch.nn as nn
+import math
+import datetime
+from typing import Dict, List, Any, Optional
+
+class ContextualTimeEncoder(nn.Module):
+    def __init__(self, output_dim: int = 128, dtype: torch.dtype = torch.float32):
+        """
+        Encodes a Unix timestamp with support for mixed precision.
+
+        Args:
+            output_dim (int): The final dimension of the output embedding.
+            dtype (torch.dtype): The data type for the model's parameters (e.g., torch.float16).
+        """
+        super().__init__()
+        self.dtype = dtype
+        if output_dim < 12:
+            raise ValueError(f"output_dim must be at least 12, but got {output_dim}")
+
+        ts_dim = output_dim // 2
+        hour_dim = output_dim // 4
+        day_dim = output_dim - ts_dim - hour_dim
+
+        # Round each sub-dimension up to an even number so the sin/cos halves match
+        self.ts_dim = ts_dim + (ts_dim % 2)
+        self.hour_dim = hour_dim + (hour_dim % 2)
+        self.day_dim = day_dim + (day_dim % 2)
+
+        total_internal_dim = self.ts_dim + self.hour_dim + self.day_dim
+
+        self.projection = nn.Linear(total_internal_dim, output_dim)
+
+        # Cast the entire module to the specified dtype
+        self.to(dtype)
+
+    def _sinusoidal_encode(self, values: torch.Tensor, d_model: int) -> torch.Tensor:
+        device = values.device
+        half_dim = d_model // 2
+
+        # Calculations for sinusoidal encoding are more stable in float32
+        div_term = torch.exp(torch.arange(0, half_dim, device=device).float() * -(math.log(10000.0) / half_dim))
+        args = values.float().unsqueeze(-1) * div_term
+
+        return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
+
+    def _cyclical_encode(self, values: torch.Tensor, d_model: int, max_val: float) -> torch.Tensor:
+        norm_values = (values.float() / max_val) * 2 * math.pi
+
+        half_dim = d_model // 2
+        args = norm_values.unsqueeze(-1).repeat(1, half_dim)
+
+        return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)
+
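The cyclical encoding above maps a bounded periodic quantity (hour of day, day of week) onto the unit circle via sin/cos, so hour 23 ends up close to hour 0 instead of maximally far, as it would with a raw scalar feature. A scalar sketch of the same idea (plain `math`, no torch; names are mine):

```python
import math

def cyclical_encode(value: float, max_val: float) -> tuple:
    """Map a periodic value in [0, max_val) onto the unit circle."""
    theta = (value / max_val) * 2 * math.pi
    return (math.sin(theta), math.cos(theta))

def dist(a, b):
    # Euclidean distance between two encoded points
    return math.hypot(a[0] - b[0], a[1] - b[1])
```

On the circle, 23:00 is a small step from midnight while 12:00 is on the opposite side, which matches the intuition a model should learn about time of day.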
+    def forward(self, timestamps: torch.Tensor) -> torch.Tensor:
+        device = self.projection.weight.device
+
+        # 1. Store original shape (e.g., [B, L]) and flatten
+        original_shape = timestamps.shape
+        timestamps_flat = timestamps.flatten().float()  # Shape [N_total]
+
+        # 2. Sinusoidal encoding (vectorized)
+        ts_encoding = self._sinusoidal_encode(timestamps_flat, self.ts_dim)
+
+        # 3. Extract hour-of-day and day-of-week. This Python loop over the
+        #    flat 1D tensor is the only non-vectorized step.
+        hours = torch.tensor([datetime.datetime.fromtimestamp(ts.item(), tz=datetime.timezone.utc).hour for ts in timestamps_flat], device=device, dtype=torch.float32)
+        days = torch.tensor([datetime.datetime.fromtimestamp(ts.item(), tz=datetime.timezone.utc).weekday() for ts in timestamps_flat], device=device, dtype=torch.float32)
+
+        # 4. Cyclical encoding (vectorized)
+        hour_encoding = self._cyclical_encode(hours, self.hour_dim, max_val=24.0)
+        day_encoding = self._cyclical_encode(days, self.day_dim, max_val=7.0)
+
+        # 5. Combine and project
+        combined_encoding = torch.cat([ts_encoding, hour_encoding, day_encoding], dim=1)
+        projected = self.projection(combined_encoding.to(self.dtype))  # Shape [N_total, output_dim]
+
+        # 6. Reshape to match the original shape (e.g., [B, L, output_dim])
+        output_shape = original_shape + (self.projection.out_features,)
+        return projected.view(output_shape)
+
+def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Mask-aware mean pooling over the sequence dimension."""
+    mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+    summed = torch.sum(last_hidden_state * mask, 1)
+    denom = torch.clamp(mask.sum(1), min=1e-9)
+    return summed / denom
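`mean_pool` averages the hidden states of only the non-padded positions: the attention mask zeroes out padding before the sum, and the denominator counts real tokens, clamped so an all-padding row cannot divide by zero. The same logic on plain lists (a sketch, not the torch version):

```python
def mean_pool(hidden, mask):
    """Average hidden-state rows where mask == 1, ignoring padded positions."""
    dim = len(hidden[0])
    summed = [0.0] * dim
    count = 0
    for row, m in zip(hidden, mask):
        if m:
            count += 1
            for j in range(dim):
                summed[j] += row[j]
    count = max(count, 1)  # mirrors torch.clamp(mask.sum(1), min=1e-9)
    return [s / count for s in summed]

# Third row is padding (mask 0) and must not influence the average
pooled = mean_pool([[2.0, 4.0], [6.0, 8.0], [99.0, 99.0]], [1, 1, 0])
```

Without the mask, the padded `[99.0, 99.0]` row would drag the pooled vector far from the real content; with it, the result is the mean of the two real rows.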
models/model.py ADDED
@@ -0,0 +1,1009 @@
+# model.py (REFACTORED AND FIXED)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoConfig, AutoModel
+from typing import List, Dict, Any, Optional, Tuple
+
+# --- Import all the encoders ---
+from models.helper_encoders import ContextualTimeEncoder
+from models.token_encoder import TokenEncoder
+from models.wallet_encoder import WalletEncoder
+from models.graph_updater import GraphUpdater
+from models.ohlc_embedder import OHLCEmbedder
+from models.HoldersEncoder import HolderDistributionEncoder  # NEW
+from models.SocialEncoders import SocialEncoder  # NEW
+import models.vocabulary as vocab  # For vocab sizes
+
+class Oracle(nn.Module):
+    """
+    Top-level model: fuses the token, wallet, graph, OHLC, holder and social
+    encoders into a single event sequence, runs it through a Qwen3 backbone,
+    and predicts quantiles over multiple time horizons.
+    """
+    def __init__(self,
+                 token_encoder: TokenEncoder,
+                 wallet_encoder: WalletEncoder,
+                 graph_updater: GraphUpdater,
+                 ohlc_embedder: OHLCEmbedder,  # NEW
+                 time_encoder: ContextualTimeEncoder,
+                 num_event_types: int,
+                 multi_modal_dim: int,
+                 event_pad_id: int,
+                 event_type_to_id: Dict[str, int],
+                 model_config_name: str = "Qwen/Qwen3-0.6B",
+                 quantiles: List[float] = [0.1, 0.5, 0.9],
+                 horizons_seconds: List[int] = [30, 60, 120, 240, 420],
+                 dtype: torch.dtype = torch.bfloat16):
+
+        super().__init__()
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = torch.device(device)
+        self.dtype = dtype
+        self.multi_modal_dim = multi_modal_dim
+
+        self.quantiles = quantiles
+        self.horizons_seconds = horizons_seconds
+        self.num_outputs = len(quantiles) * len(horizons_seconds)
+
+        # --- 2. Load Qwen3 Configuration (architecture only; training from scratch) ---
+        model_config = AutoConfig.from_pretrained(model_config_name, trust_remote_code=True)
+        self.d_model = model_config.hidden_size
+        self.model = AutoModel.from_config(model_config, trust_remote_code=True)
+        self.model.to(self.device, dtype=self.dtype)
+
+        # Quantile prediction head (maps pooled hidden state -> flattened horizon/quantile grid)
+        self.quantile_head = nn.Sequential(
+            nn.Linear(self.d_model, self.d_model),
+            nn.GELU(),
+            nn.Linear(self.d_model, self.num_outputs)
+        )
+
+        self.event_type_to_id = event_type_to_id
+
+        # --- 1. Store All Encoders ---
+        # Define token roles before using them
+        self.token_roles = {'main': 0, 'quote': 1, 'trending': 2}  # 'trending' reserved for future use
+        self.main_token_role_id = self.token_roles['main']
+        self.quote_token_role_id = self.token_roles['quote']
+        self.trending_token_role_id = self.token_roles['trending']
+
+        self.token_encoder = token_encoder
+        self.wallet_encoder = wallet_encoder
+        self.graph_updater = graph_updater
+        self.ohlc_embedder = ohlc_embedder
+        self.time_encoder = time_encoder
+
+        self.social_encoder = SocialEncoder(d_model=self.d_model, dtype=self.dtype)  # self.d_model is defined above
+
+        # --- 4. Define Sequence Feature Embeddings ---
+        self.event_type_embedding = nn.Embedding(num_event_types, self.d_model, padding_idx=event_pad_id)
+
+        # --- NEW: Token Role Embeddings ---
+        self.token_role_embedding = nn.Embedding(len(self.token_roles), self.d_model)
+
+        # --- 5. Define Entity Padding (Learnable) ---
+        self.pad_wallet_emb = nn.Parameter(torch.zeros(1, self.wallet_encoder.d_model))
+        self.pad_token_emb = nn.Parameter(torch.zeros(1, self.token_encoder.output_dim))
+        self.pad_ohlc_emb = nn.Parameter(torch.zeros(1, self.ohlc_embedder.output_dim))
+        self.pad_precomputed_emb = nn.Parameter(torch.zeros(1, self.multi_modal_dim))  # NEW: For text/images
+
+        # --- NEW: Instantiate HolderDistributionEncoder internally ---
+        self.holder_dist_encoder = HolderDistributionEncoder(
+            wallet_embedding_dim=self.wallet_encoder.d_model,
+            output_dim=self.d_model,
+            dtype=self.dtype  # Pass the correct dtype
+        )
+        self.pad_holder_snapshot_emb = nn.Parameter(torch.zeros(1, self.d_model))  # holder_dist_encoder outputs d_model
+
+        # --- 6. Define Projection MLPs ---
+        self.time_proj = nn.Linear(self.time_encoder.projection.out_features, self.d_model)
+        self.rel_ts_proj = nn.Linear(1, self.d_model)
+        self.rel_ts_norm = nn.LayerNorm(1)
+        self.wallet_proj = nn.Linear(self.wallet_encoder.d_model, self.d_model)
+        self.token_proj = nn.Linear(self.token_encoder.output_dim, self.d_model)
+        self.ohlc_proj = nn.Linear(self.ohlc_embedder.output_dim, self.d_model)
+        # self.holder_snapshot_proj is no longer needed: HolderDistributionEncoder outputs d_model directly
+
+        # --- NEW: Layers for Transfer Numerical Features ---
+        self.transfer_num_norm = nn.LayerNorm(4)  # Normalize the 4 features
+        self.transfer_num_proj = nn.Linear(4, self.d_model)  # Project to d_model
+
+        # --- NEW: Layers for Trade Numerical Features ---
+        # --- FIXED: Size reduced from 10 to 8 ---
+        self.trade_num_norm = nn.LayerNorm(8)
+        self.trade_num_proj = nn.Linear(8, self.d_model)
+        # --- NEW: Embedding for categorical dex_platform_id ---
+        self.dex_platform_embedding = nn.Embedding(vocab.NUM_DEX_PLATFORMS, self.d_model)
+        # --- NEW: Embedding for categorical trade_direction ---
+        self.trade_direction_embedding = nn.Embedding(2, self.d_model)  # 0 for buy, 1 for sell
+        # --- FIXED: Embedding for categorical mev_protection is now binary ---
+        self.mev_protection_embedding = nn.Embedding(2, self.d_model)  # 0 for false, 1 for true
+        # --- NEW: Embedding for categorical is_bundle ---
+        self.is_bundle_embedding = nn.Embedding(2, self.d_model)  # 0 for false, 1 for true
+
+        # --- NEW: Separate Layers for Deployer Trade Numerical Features ---
+        # --- FIXED: Size reduced from 10 to 8 ---
+        self.deployer_trade_num_norm = nn.LayerNorm(8)
+        self.deployer_trade_num_proj = nn.Linear(8, self.d_model)
+
+        # --- NEW: Separate Layers for Smart Wallet Trade Numerical Features ---
+        # --- FIXED: Size reduced from 10 to 8 ---
+        self.smart_wallet_trade_num_norm = nn.LayerNorm(8)
+        self.smart_wallet_trade_num_proj = nn.Linear(8, self.d_model)
+
+        # --- NEW: Layers for PoolCreated Numerical Features ---
+        # --- FIXED: Size reduced (now 2) ---
+        self.pool_created_num_norm = nn.LayerNorm(2)
+        self.pool_created_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for LiquidityChange Numerical Features ---
+        # --- FIXED: Size reduced (now 1) ---
+        self.liquidity_change_num_norm = nn.LayerNorm(1)
+        self.liquidity_change_num_proj = nn.Linear(1, self.d_model)
+        # --- NEW: Embedding for categorical change_type_id ---
+        # --- FIXED: Hardcoded the number of types (add/remove) ---
+        self.liquidity_change_type_embedding = nn.Embedding(2, self.d_model)
+
+        # --- NEW: Layers for FeeCollected Numerical Features ---
+        self.fee_collected_num_norm = nn.LayerNorm(1)  # sol_amount only
+        self.fee_collected_num_proj = nn.Linear(1, self.d_model)
+
+        # --- NEW: Layers for TokenBurn Numerical Features ---
+        self.token_burn_num_norm = nn.LayerNorm(2)  # amount_pct, amount_tokens
+        self.token_burn_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for SupplyLock Numerical Features ---
+        self.supply_lock_num_norm = nn.LayerNorm(2)  # amount_pct, lock_duration
+        self.supply_lock_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for OnChain_Snapshot Numerical Features ---
+        self.onchain_snapshot_num_norm = nn.LayerNorm(14)
+        self.onchain_snapshot_num_proj = nn.Linear(14, self.d_model)
+
+        # --- NEW: Layers for TrendingToken Numerical Features ---
+        # --- FIXED: Size reduced from 3 to 1 (rank only) ---
+        self.trending_token_num_norm = nn.LayerNorm(1)
+        self.trending_token_num_proj = nn.Linear(1, self.d_model)
+        # --- NEW: Embeddings for categorical IDs ---
+        self.trending_list_source_embedding = nn.Embedding(vocab.NUM_TRENDING_LIST_SOURCES, self.d_model)
+        self.trending_timeframe_embedding = nn.Embedding(vocab.NUM_TRENDING_LIST_TIMEFRAMES, self.d_model)
+
+        # --- NEW: Layers for BoostedToken Numerical Features ---
+        self.boosted_token_num_norm = nn.LayerNorm(2)  # total_boost_amount, rank
+        self.boosted_token_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for DexBoost_Paid Numerical Features ---
+        self.dexboost_paid_num_norm = nn.LayerNorm(2)  # amount, total_amount_on_token
+        self.dexboost_paid_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for DexProfile_Updated Features ---
+        self.dexprofile_updated_flags_proj = nn.Linear(4, self.d_model)  # Project the 4 boolean flags
+
+        # --- NEW: Projection for all pre-computed embeddings (text/images) ---
+        self.precomputed_proj = nn.Linear(self.multi_modal_dim, self.d_model)
+
+        # --- NEW: Embedding for Protocol IDs (used in Migrated event) ---
+        self.protocol_embedding = nn.Embedding(vocab.NUM_PROTOCOLS, self.d_model)
+
+        # --- NEW: Embeddings for TrackerEncoder Events ---
+        # Note: NUM_CALL_CHANNELS may need to be large and managed as the vocab grows.
+        self.alpha_group_embedding = nn.Embedding(vocab.NUM_ALPHA_GROUPS, self.d_model)
+        self.call_channel_embedding = nn.Embedding(vocab.NUM_CALL_CHANNELS, self.d_model)
+        self.cex_listing_embedding = nn.Embedding(vocab.NUM_EXCHANGES, self.d_model)
+
+        # --- NEW: Layers for GlobalTrendingEncoder Events ---
+        self.global_trending_num_norm = nn.LayerNorm(1)  # rank
+        self.global_trending_num_proj = nn.Linear(1, self.d_model)
+
+        # --- NEW: Layers for ChainSnapshot Events ---
+        self.chainsnapshot_num_norm = nn.LayerNorm(2)  # native_token_price_usd, gas_fee
+        self.chainsnapshot_num_proj = nn.Linear(2, self.d_model)
+
+        # --- NEW: Layers for Lighthouse_Snapshot Events ---
+        # --- FIXED: Size reduced from 7 to 5 ---
+        self.lighthousesnapshot_num_norm = nn.LayerNorm(5)
+        self.lighthousesnapshot_num_proj = nn.Linear(5, self.d_model)
+        # --- NEW: Embedding for the lighthouse timeframe ID ---
+        self.lighthouse_timeframe_embedding = nn.Embedding(vocab.NUM_LIGHTHOUSE_TIMEFRAMES, self.d_model)
+
+        # --- NEW: Embeddings for Special Context Tokens ---
+        self.special_context_tokens = {'Middle': 0, 'RECENT': 1}
+        self.special_context_embedding = nn.Embedding(len(self.special_context_tokens), self.d_model)
+
+        # --- 7. Prediction Head --- (replaced by self.quantile_head above)
+        # self.prediction_head = nn.Linear(self.d_model, self.num_outputs)
+
+        # --- 8. Move all new modules to the correct dtype ---
+        self.to(dtype)
+        print("Oracle model (full pipeline) initialized.")
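The quantile head emits `len(quantiles) * len(horizons_seconds)` values per sample as one flat vector. How that flat vector maps back onto a (horizon, quantile) grid is not shown in this part of the file; a plausible row-major layout (an assumption on my part, not confirmed by the source) can be sketched as:

```python
quantiles = [0.1, 0.5, 0.9]
horizons_seconds = [30, 60, 120, 240, 420]

def to_grid(flat):
    """Reshape the flat head output into {horizon: {quantile: value}}.
    Assumes row-major layout: all quantiles for horizon 0 first, and so on."""
    nq = len(quantiles)
    assert len(flat) == nq * len(horizons_seconds)
    return {
        h: dict(zip(quantiles, flat[i * nq:(i + 1) * nq]))
        for i, h in enumerate(horizons_seconds)
    }

grid = to_grid(list(range(15)))  # 15 = 3 quantiles * 5 horizons
```

Whichever layout the training code actually uses, head output and loss must agree on it; the sketch just makes the index arithmetic explicit.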
227
+
228
+ def _normalize_and_project(self,
229
+ features: torch.Tensor,
230
+ norm_layer: nn.LayerNorm,
231
+ proj_layer: nn.Linear,
232
+ log_indices: Optional[List[int]] = None) -> torch.Tensor:
233
+ """
234
+ A helper function to selectively apply log scaling, then normalize and project.
235
+ """
236
+ # Make a copy to avoid in-place modification issues
237
+ processed_features = features.clone()
238
+
239
+ # Apply log scaling only to specified indices
240
+ if log_indices:
241
+ # Ensure log_indices are valid
242
+ valid_indices = [i for i in log_indices if i < processed_features.shape[-1]]
243
+ if valid_indices:
244
+ log_features = processed_features[:, :, valid_indices].to(torch.float32)
245
+ log_scaled = torch.sign(log_features) * torch.log1p(torch.abs(log_features))
246
+ processed_features[:, :, valid_indices] = log_scaled.to(processed_features.dtype)
247
+
248
+ # Normalize and project the entire feature set
249
+ norm_dtype = norm_layer.weight.dtype
250
+ proj_dtype = proj_layer.weight.dtype
251
+ normed_features = norm_layer(processed_features.to(norm_dtype))
252
+ return proj_layer(normed_features.to(proj_dtype))
253
+
254
+ def _run_snapshot_encoders(self,
255
+ batch: Dict[str, Any],
256
+ final_wallet_embeddings_raw: torch.Tensor,
257
+ wallet_addr_to_batch_idx: Dict[str, int]) -> Dict[str, torch.Tensor]:
258
+ """
259
+ Runs snapshot-style encoders that process raw data into embeddings.
260
+ This is now truly end-to-end.
261
+ """
262
+ device = self.device
263
+ all_holder_snapshot_embeds = []
264
+
265
+ # Iterate through each HolderSnapshot event's raw data
266
+ for raw_holder_list in batch['holder_snapshot_raw_data']:
267
+ processed_holder_data = []
268
+ for holder in raw_holder_list:
269
+ wallet_addr = holder['wallet']
270
+ # Get the graph-updated wallet embedding using its index
271
+ wallet_idx = wallet_addr_to_batch_idx.get(wallet_addr, 0) # 0 is padding
272
+ if wallet_idx > 0: # If it's a valid wallet
273
+ wallet_embedding = final_wallet_embeddings_raw[wallet_idx - 1] # Adjust for 1-based indexing
274
+ processed_holder_data.append({
275
+ 'wallet_embedding': wallet_embedding,
276
+ 'pct': holder['holding_pct']
277
+ })
278
+ # Pass the processed data to the HolderDistributionEncoder
279
+ all_holder_snapshot_embeds.append(self.holder_dist_encoder(processed_holder_data))
280
+
281
+ return {"holder_snapshot": torch.cat(all_holder_snapshot_embeds, dim=0) if all_holder_snapshot_embeds else torch.empty(0, self.d_model, device=device, dtype=self.dtype)}
282
+
283
+
284
+ def _run_dynamic_encoders(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
285
+ """
286
+ Runs all dynamic encoders and returns a dictionary of raw, unprojected embeddings.
287
+ """
288
+ device = self.device
289
+ # --- NEW: Get pre-computed embedding indices ---
290
+ token_encoder_inputs = batch['token_encoder_inputs']
291
+ wallet_encoder_inputs = batch['wallet_encoder_inputs']
292
+ # The pre-computed embedding pool for the whole batch
293
+ embedding_pool = batch['embedding_pool']
294
+
295
+ ohlc_price_tensors = batch['ohlc_price_tensors'].to(device, self.dtype)
296
+ ohlc_interval_ids = batch['ohlc_interval_ids'].to(device)
297
+ graph_updater_links = batch['graph_updater_links']
298
+
299
+ # 1a. Encode Tokens
300
+ # --- FIXED: Check for a key that still exists ---
301
+ if token_encoder_inputs['name_embed_indices'].numel() > 0:
302
+ # --- AGGRESSIVE LOGGING ---
303
+ print("\n--- [Oracle DynamicEncoder LOG] ---")
304
+ print(f"[Oracle LOG] embedding_pool shape: {embedding_pool.shape}")
305
+ print(f"[Oracle LOG] name_embed_indices (shape {token_encoder_inputs['name_embed_indices'].shape}):\n{token_encoder_inputs['name_embed_indices']}")
306
+ print(f"[Oracle LOG] symbol_embed_indices (shape {token_encoder_inputs['symbol_embed_indices'].shape}):\n{token_encoder_inputs['symbol_embed_indices']}")
307
+ print(f"[Oracle LOG] image_embed_indices (shape {token_encoder_inputs['image_embed_indices'].shape}):\n{token_encoder_inputs['image_embed_indices']}")
308
+ print("--- [Oracle LOG] Calling F.embedding and TokenEncoder... ---")
309
+ # --- END LOGGING ---
310
+ # --- NEW: Gather pre-computed embeddings and pass to encoder ---
311
+ # --- CRITICAL FIX: Remove keys that are not part of the TokenEncoder's signature ---
312
+ encoder_args = token_encoder_inputs.copy()
313
+ encoder_args.pop('_addresses_for_lookup', None) # This key is for the WalletEncoder
314
+ encoder_args.pop('name_embed_indices', None)
315
+ encoder_args.pop('symbol_embed_indices', None)
316
+ encoder_args.pop('image_embed_indices', None)
317
+
318
+ # --- SAFETY: Create a padded view of the embedding pool and map missing indices (-1) to pad ---
319
+ if embedding_pool.numel() > 0:
320
+ pad_row = torch.zeros(1, embedding_pool.size(1), device=device, dtype=embedding_pool.dtype)
321
+ pool_padded = torch.cat([pad_row, embedding_pool], dim=0)
322
+ def pad_and_lookup(idx_tensor: torch.Tensor) -> torch.Tensor:
323
+ # Shift valid indices (>= 0) by +1; map invalid (< 0) to index 0 (the pad row)
324
+ shifted = torch.where(idx_tensor >= 0, idx_tensor + 1, torch.zeros_like(idx_tensor))
325
+ return F.embedding(shifted, pool_padded)
326
+ name_embeds = pad_and_lookup(token_encoder_inputs['name_embed_indices'])
327
+ symbol_embeds = pad_and_lookup(token_encoder_inputs['symbol_embed_indices'])
328
+ image_embeds = pad_and_lookup(token_encoder_inputs['image_embed_indices'])
329
+ else:
330
+ # Empty pool: provide zeros with correct shapes
331
+ n = token_encoder_inputs['name_embed_indices'].shape[0]
332
+ d = self.multi_modal_dim
333
+ zeros = torch.zeros(n, d, device=device, dtype=self.dtype)
334
+ name_embeds = zeros
335
+ symbol_embeds = zeros
336
+ image_embeds = zeros
337
+
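The shift-by-one safety pattern above — prepend a zero pad row, shift valid indices by one, and send missing indices (-1) to the pad row — can be sketched in isolation. This is a minimal standalone version; the function and variable names here are illustrative, not taken from the model:

```python
import torch
import torch.nn.functional as F

def pad_and_lookup(indices: torch.Tensor, pool: torch.Tensor) -> torch.Tensor:
    """Gather rows from `pool`, mapping invalid indices (< 0) to a zero pad row.

    A zero row is prepended at index 0 and valid indices are shifted by +1,
    so the gather never goes out of bounds and missing entries come back
    as all-zero vectors.
    """
    pad_row = torch.zeros(1, pool.size(1), dtype=pool.dtype)
    pool_padded = torch.cat([pad_row, pool], dim=0)
    shifted = torch.where(indices >= 0, indices + 1, torch.zeros_like(indices))
    return F.embedding(shifted, pool_padded)

pool = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
idx = torch.tensor([1, -1, 0])      # -1 marks a missing pre-computed embedding
out = pad_and_lookup(idx, pool)
# rows: pool[1], zeros (for the -1), pool[0]
```

The same padded pool can back several lookups (name, symbol, image indices here), so the pad row is built once per batch rather than per field.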
338
+ batch_token_embeddings_unupd = self.token_encoder(
339
+ name_embeds=name_embeds,
340
+ symbol_embeds=symbol_embeds,
341
+ image_embeds=image_embeds,
342
+ # Pass all other keys like protocol_ids, is_vanity_flags, etc.
343
+ **encoder_args
344
+ )
345
+ else:
346
+ batch_token_embeddings_unupd = torch.empty(0, self.token_encoder.output_dim, device=device, dtype=self.dtype)
347
+
348
+ # 1b. Encode Wallets
349
+ if wallet_encoder_inputs['profile_rows']:
350
+ temp_token_lookup = {
351
+ addr: batch_token_embeddings_unupd[i]
352
+ for i, addr in enumerate(batch['token_encoder_inputs']['_addresses_for_lookup']) # Use helper key
353
+ }
354
+ initial_wallet_embeddings = self.wallet_encoder(
355
+ **wallet_encoder_inputs,
356
+ token_vibe_lookup=temp_token_lookup,
357
+ embedding_pool=embedding_pool
358
+ )
359
+ else:
360
+ initial_wallet_embeddings = torch.empty(0, self.wallet_encoder.d_model, device=device, dtype=self.dtype)
361
+
362
+ # 1c. Encode OHLC
363
+ if ohlc_price_tensors.shape[0] > 0:
364
+ batch_ohlc_embeddings_raw = self.ohlc_embedder(ohlc_price_tensors, ohlc_interval_ids)
365
+ else:
366
+ batch_ohlc_embeddings_raw = torch.empty(0, self.ohlc_embedder.output_dim, device=device, dtype=self.dtype)
367
+
368
+ # 1d. Run Graph Updater
369
+ pad_wallet_raw = self.pad_wallet_emb.to(self.dtype)
370
+ pad_token_raw = self.pad_token_emb.to(self.dtype)
371
+ padded_wallet_tensor = torch.cat([pad_wallet_raw, initial_wallet_embeddings], dim=0)
372
+ padded_token_tensor = torch.cat([pad_token_raw, batch_token_embeddings_unupd], dim=0)
373
+
374
+ x_dict_initial = {}
375
+ if padded_wallet_tensor.shape[0] > 1: x_dict_initial['wallet'] = padded_wallet_tensor
376
+ if padded_token_tensor.shape[0] > 1: x_dict_initial['token'] = padded_token_tensor
377
+
378
+ if x_dict_initial and graph_updater_links:
379
+ final_entity_embeddings_dict = self.graph_updater(x_dict_initial, graph_updater_links)
380
+ final_padded_wallet_embs = final_entity_embeddings_dict.get('wallet', padded_wallet_tensor)
381
+ final_padded_token_embs = final_entity_embeddings_dict.get('token', padded_token_tensor)
382
+ else:
383
+ final_padded_wallet_embs = padded_wallet_tensor
384
+ final_padded_token_embs = padded_token_tensor
385
+
386
+ # Strip padding before returning
387
+ final_wallet_embeddings_raw = final_padded_wallet_embs[1:]
388
+ final_token_embeddings_raw = final_padded_token_embs[1:]
389
+
390
+ return {
391
+ "wallet": final_wallet_embeddings_raw,
392
+ "token": final_token_embeddings_raw,
393
+ "ohlc": batch_ohlc_embeddings_raw
394
+ }
395
+
396
+ def _project_and_gather_embeddings(self, raw_embeds: Dict[str, torch.Tensor], batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
397
+ """
398
+ Projects raw embeddings to d_model and gathers them into sequence-aligned tensors.
399
+ """
400
+ # Project raw embeddings to d_model
401
+ final_wallet_proj = self.wallet_proj(raw_embeds['wallet'])
402
+ final_token_proj = self.token_proj(raw_embeds['token'])
403
+ final_ohlc_proj = self.ohlc_proj(raw_embeds['ohlc'])
404
+
405
+ # Project padding embeddings to d_model
406
+ pad_wallet = self.wallet_proj(self.pad_wallet_emb.to(self.dtype))
407
+ pad_token = self.token_proj(self.pad_token_emb.to(self.dtype))
408
+ pad_ohlc = self.ohlc_proj(self.pad_ohlc_emb.to(self.dtype))
409
+ pad_holder_snapshot = self.pad_holder_snapshot_emb.to(self.dtype) # Already d_model
410
+
411
+ # --- NEW: Project pre-computed embeddings and create lookup ---
412
+ final_precomputed_proj = self.precomputed_proj(batch['embedding_pool'])
413
+ pad_precomputed = self.precomputed_proj(self.pad_precomputed_emb.to(self.dtype))
414
+ final_precomputed_lookup = torch.cat([pad_precomputed, final_precomputed_proj], dim=0)
415
+
416
+ # Create final lookup tables with padding at index 0
417
+ final_wallet_lookup = torch.cat([pad_wallet, final_wallet_proj], dim=0)
418
+ final_token_lookup = torch.cat([pad_token, final_token_proj], dim=0)
419
+ final_ohlc_lookup = torch.cat([pad_ohlc, final_ohlc_proj], dim=0)
420
+
421
+
422
+ # --- NEW: Add Role Embeddings ---
423
+ main_role_emb = self.token_role_embedding(torch.tensor(self.main_token_role_id, device=self.device))
424
+ quote_role_emb = self.token_role_embedding(torch.tensor(self.quote_token_role_id, device=self.device))
425
+ trending_role_emb = self.token_role_embedding(torch.tensor(self.trending_token_role_id, device=self.device))
426
+
427
+ # Gather base embeddings
428
+ gathered_main_token_embs = F.embedding(batch['token_indices'], final_token_lookup)
429
+ gathered_quote_token_embs = F.embedding(batch['quote_token_indices'], final_token_lookup)
430
+ gathered_trending_token_embs = F.embedding(batch['trending_token_indices'], final_token_lookup)
431
+ gathered_boosted_token_embs = F.embedding(batch['boosted_token_indices'], final_token_lookup)
432
+
433
+ # --- NEW: Handle HolderSnapshot ---
434
+ final_holder_snapshot_lookup = torch.cat([pad_holder_snapshot, raw_embeds['holder_snapshot']], dim=0)
435
+
436
+ # Gather embeddings for each event in the sequence
437
+ return {
438
+ "wallet": F.embedding(batch['wallet_indices'], final_wallet_lookup),
439
+ "token": gathered_main_token_embs, # This is the baseline, no role needed
440
+ "ohlc": F.embedding(batch['ohlc_indices'], final_ohlc_lookup),
441
+ "original_author": F.embedding(batch['original_author_indices'], final_wallet_lookup), # NEW
442
+ "dest_wallet": F.embedding(batch['dest_wallet_indices'], final_wallet_lookup), # Also gather dest wallet
443
+ "quote_token": gathered_quote_token_embs + quote_role_emb,
444
+ "trending_token": gathered_trending_token_embs + trending_role_emb,
445
+ "boosted_token": gathered_boosted_token_embs + trending_role_emb, # Same role as trending
446
+ "holder_snapshot": F.embedding(batch['holder_snapshot_indices'], final_holder_snapshot_lookup), # NEW
447
+ "precomputed": final_precomputed_lookup # NEW: Pass the full lookup table
448
+ }
449
+
450
+ def _get_transfer_specific_embeddings(self, batch: Dict[str, torch.Tensor], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
451
+ """
452
+ Calculates the special embeddings for Transfer/LargeTransfer events.
453
+ """
454
+ device = self.device
455
+ transfer_numerical_features = batch['transfer_numerical_features']
456
+ event_type_ids = batch['event_type_ids']
457
+
458
+ # --- FIXED: Selectively log-scale features ---
459
+ # Log scale: token_amount (idx 0), priority_fee (idx 3)
460
+ # Linear scale: transfer_pct_of_total_supply (idx 1), transfer_pct_of_holding (idx 2)
461
+ projected_transfer_features = self._normalize_and_project(
462
+ transfer_numerical_features, self.transfer_num_norm, self.transfer_num_proj, log_indices=[0, 3]
463
+ )
464
+ # Create a mask for Transfer/LargeTransfer events
465
+ transfer_event_ids = [self.event_type_to_id.get('Transfer', -1), self.event_type_to_id.get('LargeTransfer', -1)] # ADDED LargeTransfer
466
+ transfer_mask = torch.isin(event_type_ids, torch.tensor(transfer_event_ids, device=device)).unsqueeze(-1)
467
+
468
+ # Combine destination wallet and numerical features, then apply mask
469
+ return (gathered_embeds['dest_wallet'] + projected_transfer_features) * transfer_mask
470
+
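The masking pattern shared by this helper and the ones below — build event-specific features for every position, then zero them everywhere the event type does not match — can be reduced to a few lines. The ID mapping and tensors here are illustrative stand-ins:

```python
import torch

# Hypothetical event-type vocabulary for illustration only.
event_type_to_id = {'Transfer': 0, 'LargeTransfer': 1, 'Trade': 2}

event_type_ids = torch.tensor([[0, 2, 1, 2]])   # (B, L)
feature_embeds = torch.ones(1, 4, 8)            # (B, L, d_model) stand-in features

transfer_ids = torch.tensor([event_type_to_id['Transfer'],
                             event_type_to_id['LargeTransfer']])

# (B, L, 1) mask broadcasts over d_model, zeroing non-transfer positions.
mask = torch.isin(event_type_ids, transfer_ids).unsqueeze(-1)
masked = feature_embeds * mask
```

Computing features densely and multiplying by a boolean mask avoids data-dependent control flow, which keeps the forward pass batch-friendly.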
471
+ def _get_trade_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
472
+ """
473
+ Calculates the special embeddings for Trade events.
474
+ """
475
+ device = self.device
476
+ trade_numerical_features = batch['trade_numerical_features']
477
+ trade_dex_ids = batch['trade_dex_ids'] # NEW
478
+ trade_direction_ids = batch['trade_direction_ids']
479
+ trade_mev_protection_ids = batch['trade_mev_protection_ids'] # NEW
480
+ trade_is_bundle_ids = batch['trade_is_bundle_ids'] # NEW
481
+ event_type_ids = batch['event_type_ids']
482
+
483
+ # --- FIXED: Selectively log-scale features ---
484
+ # Log scale: sol_amount (idx 0), priority_fee (idx 1), total_usd (idx 7)
485
+ # Linear scale: pcts, slippage, price_impact, success flags
486
+ projected_trade_features = self._normalize_and_project(
487
+ trade_numerical_features, self.trade_num_norm, self.trade_num_proj, log_indices=[0, 1, 7]
488
+ )
489
+
490
+ # --- CORRECTED: This layer now handles both generic and large trades ---
491
+ trade_event_names = ['Trade', 'LargeTrade']
492
+ trade_event_ids = [self.event_type_to_id.get(name, -1) for name in trade_event_names]
493
+
494
+ # Create mask where event_type_id is one of the trade event ids
495
+ trade_mask = torch.isin(event_type_ids, torch.tensor(trade_event_ids, device=device)).unsqueeze(-1)
496
+
497
+ # --- NEW: Get embedding for the categorical dex_id ---
498
+ dex_id_embeds = self.dex_platform_embedding(trade_dex_ids)
499
+ direction_embeds = self.trade_direction_embedding(trade_direction_ids)
500
+ mev_embeds = self.mev_protection_embedding(trade_mev_protection_ids) # NEW
501
+ bundle_embeds = self.is_bundle_embedding(trade_is_bundle_ids) # NEW
502
+
503
+ return (projected_trade_features + dex_id_embeds + direction_embeds + mev_embeds + bundle_embeds) * trade_mask
504
+
505
+ def _get_deployer_trade_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
506
+ """
507
+ Calculates the special embeddings for Deployer_Trade events using its own layers.
508
+ """
509
+ device = self.device
510
+ deployer_trade_numerical_features = batch['deployer_trade_numerical_features']
511
+ trade_dex_ids = batch['trade_dex_ids'] # NEW: Re-use the same ID tensor
512
+ trade_direction_ids = batch['trade_direction_ids']
513
+ trade_mev_protection_ids = batch['trade_mev_protection_ids'] # NEW
514
+ trade_is_bundle_ids = batch['trade_is_bundle_ids'] # NEW
515
+ event_type_ids = batch['event_type_ids']
516
+
517
+ # --- FIXED: Selectively log-scale features ---
518
+ # Log scale: sol_amount (idx 0), priority_fee (idx 1), total_usd (idx 7)
519
+ projected_deployer_trade_features = self._normalize_and_project(
520
+ deployer_trade_numerical_features, self.deployer_trade_num_norm, self.deployer_trade_num_proj, log_indices=[0, 1, 7]
521
+ )
522
+
523
+ dex_id_embeds = self.dex_platform_embedding(trade_dex_ids)
524
+ direction_embeds = self.trade_direction_embedding(trade_direction_ids)
525
+ mev_embeds = self.mev_protection_embedding(trade_mev_protection_ids) # NEW
526
+ bundle_embeds = self.is_bundle_embedding(trade_is_bundle_ids) # NEW
527
+
528
+ deployer_trade_mask = (event_type_ids == self.event_type_to_id.get('Deployer_Trade', -1)).unsqueeze(-1)
529
+ return (projected_deployer_trade_features + dex_id_embeds + direction_embeds + mev_embeds + bundle_embeds) * deployer_trade_mask
530
+
531
+ def _get_smart_wallet_trade_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
532
+ """
533
+ Calculates the special embeddings for SmartWallet_Trade events using its own layers.
534
+ """
535
+ device = self.device
536
+ smart_wallet_trade_numerical_features = batch['smart_wallet_trade_numerical_features']
537
+ trade_dex_ids = batch['trade_dex_ids'] # NEW: Re-use the same ID tensor
538
+ trade_direction_ids = batch['trade_direction_ids']
539
+ trade_mev_protection_ids = batch['trade_mev_protection_ids'] # NEW
540
+ trade_is_bundle_ids = batch['trade_is_bundle_ids'] # NEW
541
+ event_type_ids = batch['event_type_ids']
542
+
543
+ # --- FIXED: Selectively log-scale features ---
544
+ # Log scale: sol_amount (idx 0), priority_fee (idx 1), total_usd (idx 7)
545
+ projected_features = self._normalize_and_project(
546
+ smart_wallet_trade_numerical_features, self.smart_wallet_trade_num_norm, self.smart_wallet_trade_num_proj, log_indices=[0, 1, 7]
547
+ )
548
+
549
+ dex_id_embeds = self.dex_platform_embedding(trade_dex_ids)
550
+ direction_embeds = self.trade_direction_embedding(trade_direction_ids)
551
+ mev_embeds = self.mev_protection_embedding(trade_mev_protection_ids) # NEW
552
+ bundle_embeds = self.is_bundle_embedding(trade_is_bundle_ids) # NEW
553
+
554
+ mask = (event_type_ids == self.event_type_to_id.get('SmartWallet_Trade', -1)).unsqueeze(-1)
555
+ return (projected_features + dex_id_embeds + direction_embeds + mev_embeds + bundle_embeds) * mask
556
+
557
+ def _get_pool_created_specific_embeddings(self, batch: Dict[str, torch.Tensor], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
558
+ """
559
+ Calculates the special embeddings for PoolCreated events.
560
+ """
561
+ device = self.device
562
+ pool_created_numerical_features = batch['pool_created_numerical_features']
563
+ pool_created_protocol_ids = batch['pool_created_protocol_ids'] # NEW
564
+ event_type_ids = batch['event_type_ids']
565
+
566
+ # --- FIXED: Selectively log-scale features ---
567
+ # Log scale: base_amount (idx 0), quote_amount (idx 1)
568
+ # Linear scale: pcts (idx 2, 3)
569
+ projected_features = self._normalize_and_project(
570
+ pool_created_numerical_features, self.pool_created_num_norm, self.pool_created_num_proj, log_indices=[0, 1]
571
+ )
572
+ # --- NEW: Get embedding for the categorical protocol_id ---
573
+ protocol_id_embeds = self.protocol_embedding(pool_created_protocol_ids)
574
+
575
+ # Create mask for the event
576
+ mask = (event_type_ids == self.event_type_to_id.get('PoolCreated', -1)).unsqueeze(-1)
577
+
578
+ # Combine Quote Token embedding with projected numericals
579
+ return (gathered_embeds['quote_token'] + projected_features + protocol_id_embeds) * mask
580
+
581
+ def _get_liquidity_change_specific_embeddings(self, batch: Dict[str, torch.Tensor], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
582
+ """
583
+ Calculates the special embeddings for LiquidityChange events.
584
+ """
585
+ device = self.device
586
+ liquidity_change_numerical_features = batch['liquidity_change_numerical_features']
587
+ liquidity_change_type_ids = batch['liquidity_change_type_ids'] # NEW
588
+ event_type_ids = batch['event_type_ids']
589
+
590
+ # --- FIXED: Selectively log-scale features ---
591
+ # Log scale: quote_amount (idx 0)
592
+ projected_features = self._normalize_and_project(
593
+ liquidity_change_numerical_features, self.liquidity_change_num_norm, self.liquidity_change_num_proj, log_indices=[0]
594
+ )
595
+ # --- NEW: Get embedding for the categorical change_type_id ---
596
+ change_type_embeds = self.liquidity_change_type_embedding(liquidity_change_type_ids)
597
+
598
+ # Create mask for the event
599
+ mask = (event_type_ids == self.event_type_to_id.get('LiquidityChange', -1)).unsqueeze(-1)
600
+
601
+ # Combine Quote Token embedding with projected numericals
602
+ return (gathered_embeds['quote_token'] + projected_features + change_type_embeds) * mask
603
+
604
+ def _get_fee_collected_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
605
+ """
606
+ Calculates the special embeddings for FeeCollected events.
607
+ """
608
+ device = self.device
609
+ fee_collected_numerical_features = batch['fee_collected_numerical_features']
610
+ event_type_ids = batch['event_type_ids']
611
+
612
+ # --- FIXED: Single amount, log-scale ---
613
+ projected_features = self._normalize_and_project(
614
+ fee_collected_numerical_features, self.fee_collected_num_norm, self.fee_collected_num_proj, log_indices=[0]
615
+ )
616
+
617
+ # Create mask for the event
618
+ mask = (event_type_ids == self.event_type_to_id.get('FeeCollected', -1)).unsqueeze(-1)
619
+
620
+ return projected_features * mask
621
+
622
+ def _get_token_burn_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
623
+ """
624
+ Calculates the special embeddings for TokenBurn events.
625
+ """
626
+ device = self.device
627
+ token_burn_numerical_features = batch['token_burn_numerical_features']
628
+ event_type_ids = batch['event_type_ids']
629
+
630
+ # --- FIXED: Selectively log-scale features ---
631
+ # Log scale: amount_tokens_burned (idx 1)
632
+ # Linear scale: amount_pct_of_total_supply (idx 0)
633
+ projected_features = self._normalize_and_project(
634
+ token_burn_numerical_features, self.token_burn_num_norm, self.token_burn_num_proj, log_indices=[1]
635
+ )
636
+ # Create mask for the event
637
+ mask = (event_type_ids == self.event_type_to_id.get('TokenBurn', -1)).unsqueeze(-1)
638
+
639
+ return projected_features * mask
640
+
641
+ def _get_supply_lock_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
642
+ """
643
+ Calculates the special embeddings for SupplyLock events.
644
+ """
645
+ device = self.device
646
+ supply_lock_numerical_features = batch['supply_lock_numerical_features']
647
+ event_type_ids = batch['event_type_ids']
648
+
649
+ # --- FIXED: Selectively log-scale features ---
650
+ # Log scale: lock_duration (idx 1)
651
+ # Linear scale: amount_pct_of_total_supply (idx 0)
652
+ projected_features = self._normalize_and_project(
653
+ supply_lock_numerical_features, self.supply_lock_num_norm, self.supply_lock_num_proj, log_indices=[1]
654
+ )
655
+ # Create mask for the event
656
+ mask = (event_type_ids == self.event_type_to_id.get('SupplyLock', -1)).unsqueeze(-1)
657
+
658
+ return projected_features * mask
659
+
660
+ def _get_onchain_snapshot_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
661
+ """
662
+ Calculates the special embeddings for OnChain_Snapshot events.
663
+ """
664
+ device = self.device
665
+ onchain_snapshot_numerical_features = batch['onchain_snapshot_numerical_features']
666
+ event_type_ids = batch['event_type_ids']
667
+
668
+ # --- FIXED: Selectively log-scale features ---
669
+ # Log scale: counts, market_cap, liquidity, volume, fees (almost all)
670
+ # Linear scale: growth_rate, holder_pcts (indices 3, 4, 5, 6, 7)
671
+ projected_features = self._normalize_and_project(
672
+ onchain_snapshot_numerical_features, self.onchain_snapshot_num_norm, self.onchain_snapshot_num_proj, log_indices=[0, 1, 2, 8, 9, 10, 11, 12, 13]
673
+ )
674
+ # Create mask for the event
675
+ mask = (event_type_ids == self.event_type_to_id.get('OnChain_Snapshot', -1)).unsqueeze(-1)
676
+
677
+ return projected_features * mask
678
+
679
+ def _get_trending_token_specific_embeddings(self, batch: Dict[str, torch.Tensor], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
680
+ """
681
+ Calculates the special embeddings for TrendingToken events.
682
+ """
683
+ device = self.device
684
+ trending_token_numerical_features = batch['trending_token_numerical_features']
685
+ trending_token_source_ids = batch['trending_token_source_ids'] # NEW
686
+ trending_token_timeframe_ids = batch['trending_token_timeframe_ids'] # NEW
687
+ event_type_ids = batch['event_type_ids']
688
+
689
+ # --- FIXED: Rank is already inverted (0-1), so treat as linear ---
690
+ projected_features = self._normalize_and_project(
691
+ trending_token_numerical_features, self.trending_token_num_norm, self.trending_token_num_proj, log_indices=None
692
+ )
693
+
694
+ # --- NEW: Get embeddings for categorical IDs ---
695
+ source_embeds = self.trending_list_source_embedding(trending_token_source_ids)
696
+ timeframe_embeds = self.trending_timeframe_embedding(trending_token_timeframe_ids)
697
+
698
+ # Create mask for the event
699
+ mask = (event_type_ids == self.event_type_to_id.get('TrendingToken', -1)).unsqueeze(-1)
700
+
701
+ # Combine Trending Token embedding with its projected numericals
702
+ return (gathered_embeds['trending_token'] + projected_features + source_embeds + timeframe_embeds) * mask
703
+
704
+ def _get_boosted_token_specific_embeddings(self, batch: Dict[str, torch.Tensor], gathered_embeds: Dict[str, torch.Tensor]) -> torch.Tensor:
705
+ """
706
+ Calculates the special embeddings for BoostedToken events.
707
+ """
708
+ device = self.device
709
+ boosted_token_numerical_features = batch['boosted_token_numerical_features']
710
+ event_type_ids = batch['event_type_ids']
711
+
712
+ # --- FIXED: Selectively log-scale features ---
713
+ # Log scale: total_boost_amount (idx 0)
714
+ # Linear scale: inverted rank (idx 1)
715
+ projected_features = self._normalize_and_project(
716
+ boosted_token_numerical_features, self.boosted_token_num_norm, self.boosted_token_num_proj, log_indices=[0]
717
+ )
718
+ # Create mask for the event
719
+ mask = (event_type_ids == self.event_type_to_id.get('BoostedToken', -1)).unsqueeze(-1)
720
+
721
+ # Combine Boosted Token embedding with its projected numericals
722
+ return (gathered_embeds['boosted_token'] + projected_features) * mask
723
+
724
+ def _get_dexboost_paid_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
725
+ """
726
+ Calculates the special embeddings for DexBoost_Paid events.
727
+ """
728
+ device = self.device
729
+ dexboost_paid_numerical_features = batch['dexboost_paid_numerical_features']
730
+ event_type_ids = batch['event_type_ids']
731
+
732
+ # --- FIXED: All features are amounts, so log-scale all ---
733
+ projected_features = self._normalize_and_project(
734
+ dexboost_paid_numerical_features, self.dexboost_paid_num_norm, self.dexboost_paid_num_proj, log_indices=[0, 1]
735
+ )
736
+ # Create mask for the event
737
+ mask = (event_type_ids == self.event_type_to_id.get('DexBoost_Paid', -1)).unsqueeze(-1)
738
+
739
+ return projected_features * mask
740
+
741
+ def _get_alphagroup_call_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
742
+ """
743
+ Handles AlphaGroup_Call events by looking up the group_id embedding.
744
+ """
745
+ device = self.device
746
+ group_ids = batch['alpha_group_ids']
747
+ event_type_ids = batch['event_type_ids']
748
+
749
+ # Look up the embedding for the group ID
750
+ group_embeds = self.alpha_group_embedding(group_ids)
751
+
752
+ # Create mask for the event
753
+ mask = (event_type_ids == self.event_type_to_id.get('AlphaGroup_Call', -1)).unsqueeze(-1)
754
+ return group_embeds * mask
755
+
756
+ def _get_channel_call_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
757
+ """
758
+ Handles Channel_Call events by looking up the channel_id embedding.
759
+ """
760
+ device = self.device
761
+ channel_ids = batch['channel_ids']
762
+ event_type_ids = batch['event_type_ids']
763
+
764
+ channel_embeds = self.call_channel_embedding(channel_ids)
765
+ mask = (event_type_ids == self.event_type_to_id.get('Channel_Call', -1)).unsqueeze(-1)
766
+ return channel_embeds * mask
767
+
768
+ def _get_cexlisting_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
769
+ """
770
+ Handles CexListing events by looking up the exchange_id embedding.
771
+ """
772
+ device = self.device
773
+ exchange_ids = batch['exchange_ids']
774
+ event_type_ids = batch['event_type_ids']
775
+
776
+ exchange_embeds = self.cex_listing_embedding(exchange_ids)
777
+ mask = (event_type_ids == self.event_type_to_id.get('CexListing', -1)).unsqueeze(-1)
778
+ return exchange_embeds * mask
779
+
780
+ def _get_chainsnapshot_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
781
+ """
782
+ Handles ChainSnapshot events.
783
+ """
784
+ device = self.device
785
+ numerical_features = batch['chainsnapshot_numerical_features']
786
+ event_type_ids = batch['event_type_ids']
787
+
788
+ # --- FIXED: All features are amounts/prices, so log-scale all ---
789
+ projected_features = self._normalize_and_project(
790
+ numerical_features, self.chainsnapshot_num_norm, self.chainsnapshot_num_proj, log_indices=[0, 1]
791
+ )
792
+ mask = (event_type_ids == self.event_type_to_id.get('ChainSnapshot', -1)).unsqueeze(-1)
793
+ return projected_features * mask
794
+
795
+ def _get_lighthousesnapshot_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
796
+ """
797
+ Handles Lighthouse_Snapshot events.
798
+ """
799
+ device = self.device
800
+ numerical_features = batch['lighthousesnapshot_numerical_features']
801
+ protocol_ids = batch['lighthousesnapshot_protocol_ids'] # NEW
802
+ timeframe_ids = batch['lighthousesnapshot_timeframe_ids'] # NEW
803
+ event_type_ids = batch['event_type_ids']
804
+
805
+ # --- FIXED: All features are counts/volumes, so log-scale all ---
806
+ projected_features = self._normalize_and_project(
807
+ numerical_features, self.lighthousesnapshot_num_norm, self.lighthousesnapshot_num_proj, log_indices=[0, 1, 2, 3, 4]
808
+ )
809
+ # --- NEW: Get embeddings for categorical IDs ---
810
+ # Re-use the main protocol embedding layer
811
+ protocol_embeds = self.protocol_embedding(protocol_ids)
812
+ timeframe_embeds = self.lighthouse_timeframe_embedding(timeframe_ids)
813
+
814
+ mask = (event_type_ids == self.event_type_to_id.get('Lighthouse_Snapshot', -1)).unsqueeze(-1)
815
+ return (projected_features + protocol_embeds + timeframe_embeds) * mask
816
+
817
+ def _get_migrated_specific_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
818
+ """
819
+ Handles Migrated events by looking up the protocol_id embedding.
820
+ """
821
+ device = self.device
822
+ protocol_ids = batch['migrated_protocol_ids']
823
+ event_type_ids = batch['event_type_ids']
824
+
825
+ # Look up the embedding for the protocol ID
826
+ protocol_embeds = self.protocol_embedding(protocol_ids)
827
+
828
+ # Create mask for the event
829
+ mask = (event_type_ids == self.event_type_to_id.get('Migrated', -1)).unsqueeze(-1)
830
+ return protocol_embeds * mask
831
+
832
+ def _get_special_context_embeddings(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
833
+ """
834
+ Handles special context tokens like 'Middle' and 'RECENT' by adding their unique learnable embeddings.
835
+ """
836
+ device = self.device
837
+ event_type_ids = batch['event_type_ids']
838
+ B, L = event_type_ids.shape
839
+
840
+ middle_id = self.event_type_to_id.get('Middle', -1)
841
+ recent_id = self.event_type_to_id.get('RECENT', -1)
842
+
843
+ middle_mask = (event_type_ids == middle_id)
844
+ recent_mask = (event_type_ids == recent_id)
845
+
846
+ middle_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['Middle'], device=device))
847
+ recent_emb = self.special_context_embedding(torch.tensor(self.special_context_tokens['RECENT'], device=device))
848
+
849
+ # Add the embeddings at the correct locations
850
+ return middle_mask.unsqueeze(-1) * middle_emb + recent_mask.unsqueeze(-1) * recent_emb
851
+
852
+ def _pool_hidden_states(self,
853
+ hidden_states: torch.Tensor,
854
+ attention_mask: torch.Tensor) -> torch.Tensor:
855
+ """
856
+ Pools variable-length hidden states into a single embedding per sequence by
857
+ selecting the last non-masked token for each batch element.
858
+ """
859
+ if hidden_states.size(0) == 0:
860
+ return torch.empty(0, self.d_model, device=hidden_states.device, dtype=hidden_states.dtype)
861
+
862
+ seq_lengths = attention_mask.long().sum(dim=1)
863
+ last_indices = torch.clamp(seq_lengths - 1, min=0)
864
+ batch_indices = torch.arange(hidden_states.size(0), device=hidden_states.device)
865
+ return hidden_states[batch_indices, last_indices]
866
+
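The last-non-masked-token selection performed by `_pool_hidden_states` can be demonstrated with a tiny self-contained example (the function name here is illustrative):

```python
import torch

def pool_last_token(hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Select the hidden state of the last non-masked position per sequence."""
    seq_lengths = attention_mask.long().sum(dim=1)          # valid length per row
    last_indices = torch.clamp(seq_lengths - 1, min=0)      # guard all-masked rows
    batch_indices = torch.arange(hidden_states.size(0))
    return hidden_states[batch_indices, last_indices]       # (B, D)

h = torch.arange(12.0).reshape(2, 3, 2)       # (B=2, L=3, D=2)
mask = torch.tensor([[1, 1, 0], [1, 1, 1]])   # first sequence has length 2
pooled = pool_last_token(h, mask)
# pooled[0] == h[0, 1]; pooled[1] == h[1, 2]
```

The `clamp(..., min=0)` keeps an all-zero mask from producing an index of -1, which would silently wrap around to the last position.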
867
+ def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
868
+ device = self.device
869
+
870
+ # Unpack core sequence tensors
871
+ event_type_ids = batch['event_type_ids'].to(device)
872
+ timestamps_float = batch['timestamps_float'].to(device)
873
+ relative_ts = batch['relative_ts'].to(device, self.dtype)
874
+ attention_mask = batch['attention_mask'].to(device)
875
+
876
+ B, L = event_type_ids.shape
877
+ if B == 0 or L == 0:
878
+ print("Warning: Received empty batch in Oracle forward.")
879
+ empty_hidden = torch.empty(0, L, self.d_model, device=device, dtype=self.dtype)
880
+ empty_mask = torch.empty(0, L, device=device, dtype=torch.long)
881
+ empty_quantiles = torch.empty(0, self.num_outputs, device=device, dtype=self.dtype)
882
+ return {
883
+ 'quantile_logits': empty_quantiles,
884
+ 'pooled_states': torch.empty(0, self.d_model, device=device, dtype=self.dtype),
885
+ 'hidden_states': empty_hidden,
886
+ 'attention_mask': empty_mask
887
+ }
888
+
889
+ # === 1. Run Dynamic Encoders (produces graph-updated entity embeddings) ===
890
+ dynamic_raw_embeds = self._run_dynamic_encoders(batch)
891
+
892
+
893
+ # === 2. Run Snapshot Encoders (uses dynamic_raw_embeds) ===
894
+ wallet_addr_to_batch_idx = batch['wallet_addr_to_batch_idx']
895
+ snapshot_raw_embeds = self._run_snapshot_encoders(batch, dynamic_raw_embeds['wallet'], wallet_addr_to_batch_idx)
896
+
897
+ # === 3. Project Raw Embeddings and Gather for Sequence ===
898
+ raw_embeds = {**dynamic_raw_embeds, **snapshot_raw_embeds}
899
+ gathered_embeds = self._project_and_gather_embeddings(raw_embeds, batch)
900
+
901
+ # === 4. Assemble Final `inputs_embeds` ===
902
+ event_embeds = self.event_type_embedding(event_type_ids)
903
+ ts_embeds = self.time_proj(self.time_encoder(timestamps_float))
904
+ # Stabilize relative time: minutes scale + signed log1p + LayerNorm before projection
905
+ relative_ts_fp32 = batch['relative_ts'].to(device, torch.float32)
906
+ rel_ts_minutes = relative_ts_fp32 / 60.0
907
+ rel_ts_processed = torch.sign(rel_ts_minutes) * torch.log1p(torch.abs(rel_ts_minutes))
908
+ # Match LayerNorm parameter dtype, then match Linear parameter dtype
909
+ norm_dtype = self.rel_ts_norm.weight.dtype
910
+ proj_dtype = self.rel_ts_proj.weight.dtype
911
+ rel_ts_normed = self.rel_ts_norm(rel_ts_processed.to(norm_dtype))
912
+ rel_ts_embeds = self.rel_ts_proj(rel_ts_normed.to(proj_dtype))
913
+
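The signed-log1p compression applied to relative timestamps above behaves as follows — symmetric around zero, near-linear for small deltas, logarithmic for large ones. A minimal sketch (the function name is illustrative):

```python
import torch

def signed_log1p_minutes(rel_ts_seconds: torch.Tensor) -> torch.Tensor:
    """Compress relative timestamps: seconds -> minutes -> sign(x) * log1p(|x|)."""
    minutes = rel_ts_seconds / 60.0
    return torch.sign(minutes) * torch.log1p(torch.abs(minutes))

x = torch.tensor([-3600.0, 0.0, 60.0])   # one hour ago, now, one minute ahead
y = signed_log1p_minutes(x)
# y[0] is negative, y[1] is exactly 0, y[2] == log1p(1) ~= 0.6931
```

Rescaling to minutes before the log keeps typical event gaps in a range where `log1p` is well-conditioned; the subsequent LayerNorm then standardizes the result before projection.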
914
+ # Get special embeddings for Transfer events
915
+ transfer_specific_embeds = self._get_transfer_specific_embeddings(batch, gathered_embeds)
916
+
917
+ # Get special embeddings for Trade events
918
+ trade_specific_embeds = self._get_trade_specific_embeddings(batch)
919
+
920
+ # Get special embeddings for Deployer Trade events
921
+ deployer_trade_specific_embeds = self._get_deployer_trade_specific_embeddings(batch)
922
+
923
+ # Get special embeddings for Smart Wallet Trade events
924
+ smart_wallet_trade_specific_embeds = self._get_smart_wallet_trade_specific_embeddings(batch)
925
+
926
+ # Get special embeddings for PoolCreated events
927
+ pool_created_specific_embeds = self._get_pool_created_specific_embeddings(batch, gathered_embeds)
928
+
929
+ # Get special embeddings for LiquidityChange events
930
+ liquidity_change_specific_embeds = self._get_liquidity_change_specific_embeddings(batch, gathered_embeds)
931
+
932
+ # Get special embeddings for FeeCollected events
933
+ fee_collected_specific_embeds = self._get_fee_collected_specific_embeddings(batch)
934
+
935
+ # Get special embeddings for TokenBurn events
936
+ token_burn_specific_embeds = self._get_token_burn_specific_embeddings(batch)
937
+
938
+ # Get special embeddings for SupplyLock events
939
+ supply_lock_specific_embeds = self._get_supply_lock_specific_embeddings(batch)
940
+
941
+ # Get special embeddings for OnChain_Snapshot events
942
+ onchain_snapshot_specific_embeds = self._get_onchain_snapshot_specific_embeddings(batch)
943
+
944
+ # Get special embeddings for TrendingToken events
945
+ trending_token_specific_embeds = self._get_trending_token_specific_embeddings(batch, gathered_embeds)
946
+
947
+ # Get special embeddings for BoostedToken events
948
+ boosted_token_specific_embeds = self._get_boosted_token_specific_embeddings(batch, gathered_embeds)
949
+
950
+ # Get special embeddings for DexBoost_Paid events
951
+ dexboost_paid_specific_embeds = self._get_dexboost_paid_specific_embeddings(batch)
952
+
953
+ # --- NEW: Get embeddings for Tracker events ---
954
+ alphagroup_call_specific_embeds = self._get_alphagroup_call_specific_embeddings(batch)
955
+ channel_call_specific_embeds = self._get_channel_call_specific_embeddings(batch)
956
+ cexlisting_specific_embeds = self._get_cexlisting_specific_embeddings(batch)
957
+
958
+ # --- NEW: Get embeddings for Chain and Lighthouse Snapshots ---
959
+ chainsnapshot_specific_embeds = self._get_chainsnapshot_specific_embeddings(batch)
960
+ lighthousesnapshot_specific_embeds = self._get_lighthousesnapshot_specific_embeddings(batch)
961
+
962
+ migrated_specific_embeds = self._get_migrated_specific_embeddings(batch)
963
+
964
+ # --- NEW: Handle DexProfile_Updated flags separately ---
965
+ dexprofile_updated_flags = batch['dexprofile_updated_flags']
966
+ dexprofile_flags_embeds = self.dexprofile_updated_flags_proj(dexprofile_updated_flags.to(self.dtype))
967
+
968
+ # --- REFACTORED: All text-based events are handled by the SocialEncoder ---
969
+ # This single call will replace the inefficient loops for social, dexprofile, and global trending events.
970
+ # The SocialEncoder's forward pass will need to be updated to handle this.
971
+ textual_event_embeds = self.social_encoder(
972
+ batch=batch,
973
+ gathered_embeds=gathered_embeds
974
+ )
975
+
976
+ # --- NEW: Get embeddings for special context injection tokens ---
977
+ special_context_embeds = self._get_special_context_embeddings(batch)
978
+
979
+ # --- Combine all features ---
980
+ # Sum in float32 for numerical stability, then cast back to model dtype
981
+ components = [
982
+ event_embeds, ts_embeds, rel_ts_embeds,
983
+ gathered_embeds['wallet'], gathered_embeds['token'], gathered_embeds['original_author'], gathered_embeds['ohlc'],
984
+ transfer_specific_embeds, trade_specific_embeds, deployer_trade_specific_embeds, smart_wallet_trade_specific_embeds,
985
+ pool_created_specific_embeds, liquidity_change_specific_embeds, fee_collected_specific_embeds,
986
+ token_burn_specific_embeds, supply_lock_specific_embeds, onchain_snapshot_specific_embeds,
987
+ trending_token_specific_embeds, boosted_token_specific_embeds, dexboost_paid_specific_embeds,
988
+ alphagroup_call_specific_embeds, channel_call_specific_embeds, cexlisting_specific_embeds,
989
+ migrated_specific_embeds, special_context_embeds, gathered_embeds['holder_snapshot'], textual_event_embeds,
990
+ dexprofile_flags_embeds, chainsnapshot_specific_embeds, lighthousesnapshot_specific_embeds
991
+ ]
992
+ inputs_embeds = sum([t.float() for t in components]).to(self.dtype)
993
+
994
+ hf_attention_mask = attention_mask.to(device=device, dtype=torch.long)
995
+ outputs = self.model(
996
+ inputs_embeds=inputs_embeds,
997
+ attention_mask=hf_attention_mask,
998
+ return_dict=True
999
+ )
1000
+ sequence_hidden = outputs.last_hidden_state
1001
+ pooled_states = self._pool_hidden_states(sequence_hidden, hf_attention_mask)
1002
+ quantile_logits = self.quantile_head(pooled_states)
1003
+
1004
+ return {
1005
+ 'quantile_logits': quantile_logits,
1006
+ 'pooled_states': pooled_states,
1007
+ 'hidden_states': sequence_hidden,
1008
+ 'attention_mask': hf_attention_mask
1009
+ }
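The `components` list above is summed in float32 before casting back to the model dtype, since repeated half-precision additions can lose low-order bits. A minimal, self-contained sketch of that pattern (toy tensors standing in for the real embeddings):

```python
import torch

def sum_components(components, out_dtype=torch.float16):
    # Accumulate in float32 to avoid precision loss from many
    # half-precision adds, then cast back to the working dtype.
    total = torch.zeros_like(components[0], dtype=torch.float32)
    for t in components:
        total += t.float()
    return total.to(out_dtype)

parts = [torch.ones(2, 4, dtype=torch.float16) for _ in range(3)]
summed = sum_components(parts)
print(summed.dtype, summed.shape)  # torch.float16 torch.Size([2, 4])
```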
models/multi_modal_processor.py ADDED
@@ -0,0 +1,184 @@
1
+ # multi_modal_processor.py
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ from transformers import AutoModel, AutoProcessor, AutoConfig
6
+ from typing import List, Union
7
+ from PIL import Image
8
+ import requests
9
+ import io
10
+ import os
11
+ import traceback
12
+ import numpy as np
13
+
14
+ # Suppress warnings
15
+ os.environ["TRANSFORMERS_VERBOSITY"] = "error"
16
+
17
+ class MultiModalEncoder:
18
+ """
19
+ Encodes text OR images into a shared, NORMALIZED embedding space
20
+ using google/siglip-so400m-patch16-256-i18n.
21
+ This class is intended for creating embeddings for vector search.
22
+ """
23
+
24
+ def __init__(self, model_id="google/siglip-so400m-patch16-256-i18n", dtype: torch.dtype = torch.bfloat16):
25
+ self.model_id = model_id
26
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
27
+
28
+ self.dtype = dtype
29
+
30
+
31
+ try:
32
+ # --- SigLIP Loading with Config Fix ---
33
+ self.processor = AutoProcessor.from_pretrained(
34
+ self.model_id,
35
+ use_fast=True
36
+ )
37
+
38
+ config = AutoConfig.from_pretrained(self.model_id)
39
+
40
+ if not hasattr(config, 'projection_dim'):
41
+ # print("❗ Config missing projection_dim, patching...")
42
+ config.projection_dim = config.text_config.hidden_size
43
+
44
+ self.model = AutoModel.from_pretrained(
45
+ self.model_id,
46
+ config=config,
47
+ dtype=self.dtype, # Newer transformers accepts dtype=; older releases name this torch_dtype=
48
+ trust_remote_code=False
49
+ ).to(self.device).eval()
50
+ # -----------------------------------------------
51
+
52
+ self.embedding_dim = config.projection_dim
53
+
54
+ except Exception as e:
55
+ print(f"❌ Failed to load SigLIP model or components: {e}")
56
+ traceback.print_exc()
57
+ raise
58
+
59
+ @torch.no_grad()
60
+ def __call__(self, x: Union[List[str], List[Image.Image]]) -> torch.Tensor:
61
+ """
62
+ Encode a batch of text or images into normalized [batch_size, embedding_dim] vectors.
63
+ This is correct for storing in a vector DB for cosine similarity.
64
+ """
65
+ if not x:
66
+ return torch.empty(0, self.embedding_dim).to(self.device)
67
+
68
+ is_text = isinstance(x[0], str)
69
+
70
+ autocast_dtype = self.dtype if self.dtype in [torch.float16, torch.bfloat16] else None
71
+
72
+ print(f"\n[MME LOG] ENTERING __call__ for {'TEXT' if is_text else 'IMAGE'} batch of size {len(x)}")
73
+ print(f"[MME LOG] Input data preview: {str(x[0])[:100] if is_text else x[0]}")
74
+
75
+ with torch.amp.autocast(device_type=self.device, enabled=(self.device == 'cuda' and autocast_dtype is not None), dtype=autocast_dtype):
76
+ try:
77
+ if is_text:
78
+ inputs = self.processor(
79
+ text=x,
80
+ return_tensors="pt",
81
+ padding="max_length",
82
+ truncation=True
83
+ ).to(self.device)
84
+ print(f"[MME LOG] Text processor output shape: {inputs['input_ids'].shape}")
85
+ embeddings = self.model.get_text_features(**inputs)
86
+ else:
87
+ rgb_images = [img.convert("RGB") if img.mode != 'RGB' else img for img in x]
88
+ inputs = self.processor(
89
+ images=rgb_images,
90
+ return_tensors="pt"
91
+ ).to(self.device)
92
+
93
+ if 'pixel_values' in inputs and inputs['pixel_values'].dtype != self.dtype:
94
+ inputs['pixel_values'] = inputs['pixel_values'].to(self.dtype)
95
+
96
+ embeddings = self.model.get_image_features(**inputs)
97
+
98
+ print(f"[MME LOG] Raw model output embeddings shape: {embeddings.shape}, dtype: {embeddings.dtype}")
99
+
100
+ # Normalization is required so downstream cosine similarity reduces to a dot product.
101
+ # Normalize in float32 for numerical stability
102
+ embeddings = F.normalize(embeddings.float(), p=2, dim=-1)
103
+ print(f"[MME LOG] Normalized embeddings shape: {embeddings.shape}, dtype: {embeddings.dtype}")
104
+
105
+ final_embeddings = embeddings.to(self.dtype)
106
+ print(f"[MME LOG] Final embeddings shape: {final_embeddings.shape}, dtype: {final_embeddings.dtype}. EXITING __call__.")
107
+ return final_embeddings
108
+
109
+ except Exception as e:
110
+ print(f"❌ [MME LOG] FATAL ERROR during encoding {'text' if is_text else 'images'}: {e}")
111
+ traceback.print_exc()
112
+ return torch.empty(0, self.embedding_dim).to(self.device)
113
+
114
+ # --- Test block (SigLIP) ---
115
+ if __name__ == "__main__":
116
+ # This test exercises the encoder through its public __call__ interface.
117
+
118
+ MODEL_ID = "google/siglip-so400m-patch16-256-i18n"
119
+ print(f"\n--- MultiModalEncoder Test ({MODEL_ID}) ---")
120
+
121
+ texts = [
122
+ "Uranus", # Text 0
123
+ "Anus", # Text 1
124
+ "Ass", # Text 2
125
+ "Planet", # Text 3
126
+ "Dog" # Text 4
127
+ ]
128
+
129
+ try:
130
+ img_urls = [
131
+ "https://pbs.twimg.com/media/G3ra9C8W0AAGR8V.jpg", # Image 0: Uranus meme pic
132
+ ]
133
+ headers = {"User-Agent": "Mozilla/5.0"}
134
+ images = [
135
+ Image.open(io.BytesIO(requests.get(u, headers=headers).content))
136
+ for u in img_urls
137
+ ]
138
+
139
+ size = 256 # Model's expected size
140
+ images.append(Image.new("RGB", (size, size), color="green")) # Image 1: Green Square
141
+ print(f"✅ Downloaded test image and created green square (size {size}x{size}).")
142
+
143
+ except Exception as e:
144
+ print(f"❌ Failed to load images: {e}")
145
+ traceback.print_exc()
146
+ exit()
147
+
148
+ try:
149
+ # 1. Initialize your encoder
150
+ encoder = MultiModalEncoder(model_id=MODEL_ID)
151
+
152
+ print("\n--- Encoding Texts (Separately) ---")
153
+ text_embeddings = encoder(texts) # Uses __call__
154
+ print(f"Shape: {text_embeddings.shape}")
155
+
156
+ print("\n--- Encoding Images (Separately) ---")
157
+ image_embeddings = encoder(images) # Uses __call__
158
+ print(f"Shape: {image_embeddings.shape}")
159
+
160
+
161
+ print("\n--- Similarity Check (Your Goal) ---")
162
+
163
+ # 2. Calculate Cosine Similarity
164
+ # This is just a dot product because the encoder __call__ method
165
+ # already normalized the vectors.
166
+ similarity_matrix = torch.matmul(image_embeddings.cpu(), text_embeddings.cpu().T).numpy()
167
+
168
+ np.set_printoptions(precision=4, suppress=True)
169
+ print("\nCosine Similarity matrix (image × text):")
170
+ # Row: Images (0: Uranus Pic, 1: Green)
171
+ # Col: Texts (0: Uranus, 1: Anus, 2: Ass, 3: Planet, 4: Dog)
172
+ print(similarity_matrix)
173
+
174
+ print("\nSpecific Similarity Scores (Cosine Similarity, -1.0 to 1.0):")
175
+ print(f"Image 0 (Uranus pic) vs Text 0 (Uranus): {similarity_matrix[0][0]:.4f}")
176
+ print(f"Image 0 (Uranus pic) vs Text 1 (Anus): {similarity_matrix[0][1]:.4f}")
177
+ print(f"Image 0 (Uranus pic) vs Text 3 (Planet): {similarity_matrix[0][3]:.4f}")
178
+ print(f"Image 0 (Uranus pic) vs Text 4 (Dog): {similarity_matrix[0][4]:.4f}")
179
+ print(f"Image 1 (Green) vs Text 4 (Dog): {similarity_matrix[1][4]:.4f}")
180
+
181
+ except Exception as e:
182
+ print(f"\n--- An error occurred during the SigLIP test run ---")
183
+ print(f"Error: {e}")
184
+ traceback.print_exc()
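The similarity check above relies on the fact that, for L2-normalized vectors, cosine similarity is just a matrix product. A small standalone sketch with random stand-in vectors (not actual SigLIP outputs):

```python
import torch
import torch.nn.functional as F

# Hypothetical stand-in embeddings; in the real class these come
# from get_image_features / get_text_features, then F.normalize.
img = F.normalize(torch.randn(2, 8), p=2, dim=-1)
txt = F.normalize(torch.randn(5, 8), p=2, dim=-1)

# Both sides are unit-norm, so a plain matmul yields cosine similarity.
sim = img @ txt.T
print(sim.shape)  # torch.Size([2, 5]); every entry lies in [-1, 1]
```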
models/ohlc_embedder.py ADDED
@@ -0,0 +1,114 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from typing import List
5
+
6
+ # --- Import vocabulary for the test block ---
7
+ import models.vocabulary as vocab
8
+
9
+ class OHLCEmbedder(nn.Module):
10
+ """
11
+ Embeds a sequence of Open and Close prices AND its interval.
12
+
13
+ FIXED: Now takes interval_ids as input and combines an
14
+ interval embedding with the 1D-CNN chart pattern features.
15
+ """
16
+ def __init__(
17
+ self,
18
+ # --- NEW: Interval vocab size ---
19
+ num_intervals: int,
20
+ input_channels: int = 2, # Open, Close
21
+ sequence_length: int = 300,
22
+ cnn_channels: List[int] = [16, 32, 64],
23
+ kernel_sizes: List[int] = [3, 3, 3],
24
+ # --- NEW: Interval embedding dim ---
25
+ interval_embed_dim: int = 32,
26
+ output_dim: int = 4096,
27
+ dtype: torch.dtype = torch.float16
28
+ ):
29
+ super().__init__()
30
+ assert len(cnn_channels) == len(kernel_sizes), "cnn_channels and kernel_sizes must have the same length"
31
+
32
+ self.dtype = dtype
33
+ self.sequence_length = sequence_length
34
+ self.cnn_layers = nn.ModuleList()
35
+ self.output_dim = output_dim
36
+
37
+ in_channels = input_channels
38
+ current_seq_len = sequence_length
39
+
40
+ for i, (out_channels, k_size) in enumerate(zip(cnn_channels, kernel_sizes)):
41
+ conv = nn.Conv1d(
42
+ in_channels=in_channels,
43
+ out_channels=out_channels,
44
+ kernel_size=k_size,
45
+ padding='same'
46
+ )
47
+ self.cnn_layers.append(conv)
48
+ pool = nn.MaxPool1d(kernel_size=2, stride=2)
49
+ self.cnn_layers.append(pool)
50
+ current_seq_len = current_seq_len // 2
51
+ self.cnn_layers.append(nn.ReLU())
52
+ in_channels = out_channels
53
+
54
+ self.global_pool = nn.AdaptiveAvgPool1d(1)
55
+
56
+ final_cnn_channels = cnn_channels[-1]
57
+
58
+ # --- NEW: Interval Embedding Layer ---
59
+ self.interval_embedding = nn.Embedding(num_intervals, interval_embed_dim, padding_idx=0)
60
+
61
+ # --- NEW: MLP input dim is (CNN features + Interval features) ---
62
+ mlp_input_dim = final_cnn_channels + interval_embed_dim
63
+
64
+ self.mlp = nn.Sequential(
65
+ nn.Linear(mlp_input_dim, mlp_input_dim * 2),
66
+ nn.GELU(),
67
+ nn.LayerNorm(mlp_input_dim * 2),
68
+ nn.Linear(mlp_input_dim * 2, output_dim),
69
+ nn.LayerNorm(output_dim)
70
+ )
71
+
72
+ self.to(dtype)
73
+
74
+ def forward(self, x: torch.Tensor, interval_ids: torch.Tensor) -> torch.Tensor:
75
+ """
76
+ Args:
77
+ x (torch.Tensor): Batch of normalized OHLC sequences.
78
+ Shape: [batch_size, 2, sequence_length]
79
+ interval_ids (torch.Tensor): Batch of interval IDs.
80
+ Shape: [batch_size]
81
+ Returns:
82
+ torch.Tensor: Batch of OHLC embeddings.
83
+ Shape: [batch_size, output_dim]
84
+ """
85
+ if x.shape[1] != 2 or x.shape[2] != self.sequence_length:
86
+ raise ValueError(f"Input tensor shape mismatch. Expected [B, 2, {self.sequence_length}], got {x.shape}")
87
+
88
+ x = x.to(self.dtype)
89
+
90
+ # 1. Pass through CNN layers
91
+ for layer in self.cnn_layers:
92
+ x = layer(x)
93
+
94
+ # 2. Apply global average pooling
95
+ x = self.global_pool(x)
96
+
97
+ # 3. Flatten for MLP
98
+ x = x.squeeze(-1)
99
+ # Shape: [batch_size, final_cnn_channels]
100
+
101
+ # 4. --- NEW: Get interval embedding ---
102
+ interval_embed = self.interval_embedding(interval_ids)
103
+ # Shape: [batch_size, interval_embed_dim]
104
+
105
+ # 5. --- NEW: Combine features ---
106
+ combined = torch.cat([x, interval_embed], dim=1)
107
+ # Shape: [batch_size, final_cnn_channels + interval_embed_dim]
108
+
109
+ # 6. Pass through final MLP
110
+ x = self.mlp(combined)
111
+ # Shape: [batch_size, output_dim]
112
+
113
+ return x
114
+
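The shape flow through `OHLCEmbedder`'s CNN stack can be checked in isolation: each `MaxPool1d(2, 2)` halves the sequence length, and `AdaptiveAvgPool1d(1)` collapses whatever length remains. A minimal sketch mirroring the default hyperparameters above (illustrative only, not the full module):

```python
import torch
import torch.nn as nn

# conv -> pool -> relu stack with the default channel sizes
layers = []
in_ch, seq = 2, 300
for out_ch in [16, 32, 64]:
    layers += [nn.Conv1d(in_ch, out_ch, kernel_size=3, padding='same'),
               nn.MaxPool1d(2, 2),
               nn.ReLU()]
    in_ch, seq = out_ch, seq // 2  # each pool halves the sequence

net = nn.Sequential(*layers, nn.AdaptiveAvgPool1d(1))

x = torch.randn(4, 2, 300)  # [batch, (open, close), sequence_length]
out = net(x)
print(out.shape)  # torch.Size([4, 64, 1]); seq went 300 -> 150 -> 75 -> 37 -> 1
```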
models/token_encoder.py ADDED
@@ -0,0 +1,182 @@
1
+ # token_encoder.py (FIXED)
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from typing import List, Any
6
+ from PIL import Image
7
+
8
+ from models.multi_modal_processor import MultiModalEncoder
9
+ from models.wallet_set_encoder import WalletSetEncoder # Using your set encoder
10
+ from models.vocabulary import NUM_PROTOCOLS
11
+
12
+ class TokenEncoder(nn.Module):
13
+ """
14
+ Encodes a token's core identity into a single <TokenVibeEmbedding>.
15
+
16
+ FIXED: This version uses a robust fusion architecture and provides
17
+ a dynamic, smaller output dimension (e.g., 2048) suitable for
18
+ being an input to a larger model.
19
+ """
20
+ def __init__(
21
+ self,
22
+ multi_dim: int, # NEW: Pass the dimension directly
23
+ output_dim: int = 2048,
24
+ internal_dim: int = 1024, # INCREASED: Better balance between bottleneck and capacity
25
+ protocol_embed_dim: int = 64,
26
+ vanity_embed_dim: int = 32, # NEW: Small embedding for the vanity flag
27
+ nhead: int = 4,
28
+ num_layers: int = 1,
29
+ dtype: torch.dtype = torch.float16
30
+ ):
31
+ """
32
+ Initializes the TokenEncoder.
33
+
34
+ Args:
35
+ multi_dim (int): The embedding dimension of the multimodal encoder (e.g., 1152).
36
+ output_dim (int):
37
+ The final dimension of the <TokenVibeEmbedding> (e.g., 2048).
38
+ internal_dim (int):
39
+ The shared dimension for the internal fusion transformer (e.g., 1024).
40
+ protocol_embed_dim (int):
41
+ Small dimension for the protocol ID (e.g., 64).
42
+ vanity_embed_dim (int):
43
+ Small dimension for the is_vanity boolean flag.
44
+ nhead (int):
45
+ Attention heads for the fusion transformer.
46
+ num_layers (int):
47
+ Layers for the fusion transformer.
48
+ dtype (torch.dtype):
49
+ The data type (e.g., torch.float16).
50
+ """
51
+ super().__init__()
52
+ self.output_dim = output_dim
53
+ self.internal_dim = internal_dim
54
+ self.dtype = dtype
55
+
56
+ # Store SigLIP's fixed output dim (e.g., 1152)
57
+ self.multi_dim = multi_dim
58
+
59
+ # --- 1. Projection Layers ---
60
+ # Project all features to the *internal_dim*
61
+ self.name_proj = nn.Linear(self.multi_dim, internal_dim)
62
+ self.symbol_proj = nn.Linear(self.multi_dim, internal_dim)
63
+ self.image_proj = nn.Linear(self.multi_dim, internal_dim)
64
+
65
+ # --- 2. Categorical & Boolean Feature Embeddings ---
66
+
67
+ # Use small vocab size and small embed dim
68
+ self.protocol_embedding = nn.Embedding(NUM_PROTOCOLS, protocol_embed_dim)
69
+ # Project from small dim (64) up to internal_dim (1024)
70
+ self.protocol_proj = nn.Linear(protocol_embed_dim, internal_dim)
71
+
72
+ # NEW: Embedding for the is_vanity boolean flag
73
+ self.vanity_embedding = nn.Embedding(2, vanity_embed_dim) # 2 classes: True/False
74
+ self.vanity_proj = nn.Linear(vanity_embed_dim, internal_dim)
75
+
76
+ # --- 3. Fusion Encoder ---
77
+ # Re-use WalletSetEncoder to fuse the sequence of 5 features
78
+ self.fusion_transformer = WalletSetEncoder(
79
+ d_model=internal_dim,
80
+ nhead=nhead,
81
+ num_layers=num_layers,
82
+ dim_feedforward=internal_dim * 4, # Standard 4x
83
+ dtype=dtype
84
+ )
85
+
86
+ # --- 4. Final Output Projection ---
87
+ # Project from the transformer's output (internal_dim)
88
+ # to the final, dynamic output_dim.
89
+ self.final_projection = nn.Sequential(
90
+ nn.Linear(internal_dim, internal_dim * 2),
91
+ nn.GELU(),
92
+ nn.LayerNorm(internal_dim * 2),
93
+ nn.Linear(internal_dim * 2, output_dim),
94
+ nn.LayerNorm(output_dim)
95
+ )
96
+
97
+ # Cast new layers to the correct dtype and device
98
+ device = "cuda" if torch.cuda.is_available() else "cpu"
99
+ self.to(device=device, dtype=dtype)
100
+
101
+ def forward(
102
+ self,
103
+ name_embeds: torch.Tensor,
104
+ symbol_embeds: torch.Tensor,
105
+ image_embeds: torch.Tensor,
106
+ protocol_ids: torch.Tensor,
107
+ is_vanity_flags: torch.Tensor,
108
+ ) -> torch.Tensor:
109
+ """
110
+ Processes a batch of token data to create a batch of embeddings.
111
+
112
+ Args:
113
+ name_embeds (torch.Tensor): Pre-computed embeddings for token names. Shape: [B, siglip_dim]
114
+ symbol_embeds (torch.Tensor): Pre-computed embeddings for token symbols. Shape: [B, siglip_dim]
115
+ image_embeds (torch.Tensor): Pre-computed embeddings for token images. Shape: [B, siglip_dim]
116
+ protocol_ids (torch.Tensor): Batch of protocol IDs. Shape: [B]
117
+ is_vanity_flags (torch.Tensor): Batch of boolean flags for vanity addresses. Shape: [B]
118
+
119
+ Returns:
120
+ torch.Tensor: The final <TokenVibeEmbedding> batch.
121
+ Shape: [batch_size, output_dim]
122
+ """
123
+ device = name_embeds.device
124
+ batch_size = name_embeds.shape[0]
125
+
126
+ # 2. Get Protocol embedding (small)
127
+ print(f"\n--- [TokenEncoder LOG] ENTERING FORWARD PASS (Batch Size: {batch_size}) ---")
128
+ print(f"[TokenEncoder LOG] Input protocol_ids (shape {protocol_ids.shape}):\n{protocol_ids}")
129
+ print(f"[TokenEncoder LOG] Protocol Embedding Vocab Size: {self.protocol_embedding.num_embeddings}")
130
+
131
+ protocol_ids_long = protocol_ids.to(device, dtype=torch.long)
132
+ protocol_emb_raw = self.protocol_embedding(protocol_ids_long) # [B, 64]
133
+ print(f"[TokenEncoder LOG] Raw protocol embeddings shape: {protocol_emb_raw.shape}")
134
+
135
+ # NEW: Get vanity embedding
136
+ vanity_ids_long = is_vanity_flags.to(device, dtype=torch.long)
137
+ vanity_emb_raw = self.vanity_embedding(vanity_ids_long) # [B, 32]
138
+
139
+ # 3. Project all features to internal_dim (e.g., 1024)
140
+ print(f"[TokenEncoder LOG] Projecting features to internal_dim: {self.internal_dim}")
141
+ name_emb = self.name_proj(name_embeds)
142
+ symbol_emb = self.symbol_proj(symbol_embeds)
143
+ image_emb = self.image_proj(image_embeds)
144
+ protocol_emb = self.protocol_proj(protocol_emb_raw)
145
+ vanity_emb = self.vanity_proj(vanity_emb_raw) # NEW
146
+
147
+ # 4. Stack all projected features into a sequence
148
+ feature_sequence = torch.stack([
149
+ name_emb,
150
+ symbol_emb,
151
+ image_emb,
152
+ protocol_emb,
153
+ vanity_emb, # NEW: Add the vanity embedding to the sequence
154
+ ], dim=1)
155
+
156
+ print(f"[TokenEncoder LOG] Stacked feature_sequence shape: {feature_sequence.shape}")
157
+ print(f" - name_emb shape: {name_emb.shape}")
158
+ print(f" - symbol_emb shape: {symbol_emb.shape}")
159
+ print(f" - image_emb shape: {image_emb.shape}")
160
+ print(f" - protocol_emb shape: {protocol_emb.shape}")
161
+ print(f" - vanity_emb shape: {vanity_emb.shape}") # ADDED: Log the new vanity embedding shape
162
+
163
+ # 5. Create the padding mask (all False, since we have a fixed number of features for all)
164
+ padding_mask = torch.zeros(batch_size, feature_sequence.shape[1], device=device, dtype=torch.bool)
165
+ print(f"[TokenEncoder LOG] Created padding_mask of shape: {padding_mask.shape}")
166
+
167
+ # 6. Fuse the sequence with the Transformer Encoder
168
+ # This returns the [CLS] token output.
169
+ # Shape: [B, internal_dim]
170
+ fused_embedding = self.fusion_transformer(
171
+ item_embeds=feature_sequence,
172
+ src_key_padding_mask=padding_mask
173
+ )
174
+ print(f"[TokenEncoder LOG] Fused embedding shape after transformer: {fused_embedding.shape}")
175
+
176
+ # 7. Project to the final output dimension
177
+ # Shape: [B, output_dim]
178
+ token_vibe_embedding = self.final_projection(fused_embedding)
179
+ print(f"[TokenEncoder LOG] Final token_vibe_embedding shape: {token_vibe_embedding.shape}")
180
+ print(f"--- [TokenEncoder LOG] EXITING FORWARD PASS ---\n")
181
+
182
+ return token_vibe_embedding
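The fusion step in `TokenEncoder` stacks five projected feature vectors into a length-5 sequence and lets attention mix them. A hedged sketch of that idea, using `nn.TransformerEncoder` plus mean pooling as a stand-in for the custom `WalletSetEncoder` and its CLS readout:

```python
import torch
import torch.nn as nn

B, d = 3, 64  # toy batch size and internal_dim
# name, symbol, image, protocol, vanity (already projected to d)
feats = [torch.randn(B, d) for _ in range(5)]
seq = torch.stack(feats, dim=1)  # [B, 5, d]

layer = nn.TransformerEncoderLayer(d_model=d, nhead=4, batch_first=True)
fusion = nn.TransformerEncoder(layer, num_layers=1)

fused = fusion(seq)          # [B, 5, d]; each feature attends to the others
pooled = fused.mean(dim=1)   # simple pooling stand-in for the CLS readout
print(pooled.shape)  # torch.Size([3, 64])
```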
models/vocabulary.py ADDED
@@ -0,0 +1,188 @@
1
+ # vocabulary.py
2
+ """
3
+ Defines the vocabulary and mappings for categorical features.
4
+ """
5
+
6
+ # --- Event Type Mappings ---
7
+ EVENT_NAMES = [
8
+ '__PAD__', 'Chart_Segment', 'Mint',
9
+ 'Transfer', 'LargeTransfer',
10
+ 'Trade',
11
+ 'Deployer_Trade',
12
+ 'SmartWallet_Trade',
13
+ 'LargeTrade',
14
+ 'PoolCreated',
15
+ 'LiquidityChange',
16
+ 'FeeCollected',
17
+ 'TokenBurn',
18
+ 'SupplyLock',
19
+ 'OnChain_Snapshot',
20
+ 'HolderSnapshot',
21
+ 'TrendingToken',
22
+ 'BoostedToken',
23
+ 'XPost',
24
+ 'XRetweet',
25
+ 'XReply',
26
+ 'XQuoteTweet',
27
+ 'PumpReply',
28
+ 'DexBoost_Paid',
29
+ 'DexProfile_Updated',
30
+ 'AlphaGroup_Call',
31
+ 'Channel_Call',
32
+ 'CexListing',
33
+ 'TikTok_Trending_Hashtag',
34
+ 'XTrending_Hashtag',
35
+ 'ChainSnapshot',
36
+ 'Lighthouse_Snapshot',
37
+ 'Migrated',
38
+ 'MIDDLE',
39
+ 'RECENT'
40
+ ]
41
+ EVENT_TO_ID = {name: i for i, name in enumerate(EVENT_NAMES)}
42
+ ID_TO_EVENT = {i: name for i, name in enumerate(EVENT_NAMES)}
43
+ NUM_EVENT_TYPES = len(EVENT_NAMES)
44
+
45
+ # --- Protocol Mappings ---
46
+
47
+ # The canonical list of protocol names
48
+ PROTOCOL_NAMES = [
49
+ "Unknown",
50
+ "Pump V1",
51
+ "Pump AMM",
52
+ "Bonk",
53
+ "Raydium CPMM"
54
+ ]
55
+
56
+ PROTOCOL_TO_ID = {name: i for i, name in enumerate(PROTOCOL_NAMES)}
57
+ ID_TO_PROTOCOL = {i: name for i, name in enumerate(PROTOCOL_NAMES)}
58
+ NUM_PROTOCOLS = len(PROTOCOL_NAMES)
59
+
60
+
61
+ # --- Neo4J Link Type Mappings ---
62
+ # UPDATED: Added link types from your Neo4j schema
63
+ LINK_TYPES = [
64
+ "TransferLink",
65
+ "TransferLinkToken",
66
+ "BundleTradeLink",
67
+ "CopiedTradeLink",
68
+ "CoordinatedActivityLink",
69
+ "MintedLink",
70
+ "SnipedLink",
71
+ "LockedSupplyLink",
72
+ "BurnedLink",
73
+ "ProvidedLiquidityLink",
74
+ "WhaleOfLink",
75
+ "TopTraderOfLink",
76
+ ]
77
+
78
+ LINK_TYPE_TO_ID = {name: i for i, name in enumerate(LINK_TYPES)}
79
+ ID_TO_LINK_TYPE = {i: name for i, name in enumerate(LINK_TYPES)}
80
+ NUM_LINK_TYPES = len(LINK_TYPES)
81
+
82
+ LINK_NAME_TO_TRIPLET = {
83
+ # Wallet <-> Wallet Links
84
+ "TransferLink": ('wallet', 'TransferLink', 'wallet'),
85
+ "BundleTradeLink": ('wallet', 'BundleTradeLink', 'wallet'),
86
+ "CopiedTradeLink": ('wallet', 'CopiedTradeLink', 'wallet'),
87
+ "CoordinatedActivityLink": ('wallet', 'CoordinatedActivityLink', 'wallet'),
88
+
89
+ # Wallet -> Token Links
90
+ "TransferLinkToken": ('wallet', 'TransferLinkToken', 'token'),
91
+ "MintedLink": ('wallet', 'MintedLink', 'token'),
92
+ "SnipedLink": ('wallet', 'SnipedLink', 'token'),
93
+ "LockedSupplyLink": ('wallet', 'LockedSupplyLink', 'token'),
94
+ "BurnedLink": ('wallet', 'BurnedLink', 'token'),
95
+ "ProvidedLiquidityLink": ('wallet', 'ProvidedLiquidityLink', 'token'),
96
+ "WhaleOfLink": ('wallet', 'WhaleOfLink', 'token'),
97
+ "TopTraderOfLink": ('wallet', 'TopTraderOfLink', 'token'),
98
+ }
99
+
100
+
101
+ # --- NEW: OHLC Interval Mappings ---
102
+ OHLC_INTERVALS = [
103
+ "Unknown", # ID 0
104
+ "1s", # ID 1
105
+ "30s", # ID 2
106
+ ]
107
+
108
+ INTERVAL_TO_ID = {name: i for i, name in enumerate(OHLC_INTERVALS)}
109
+ ID_TO_INTERVAL = {i: name for i, name in enumerate(OHLC_INTERVALS)}
110
+ NUM_OHLC_INTERVALS = len(OHLC_INTERVALS)
111
+
112
+ DEX_NAMES = [
113
+ "Unknown",
114
+ "Axiom",
115
+ "Bullx",
116
+ "OXK",
117
+ "Trojan",
118
+ "Jupyter"
119
+ ]
120
+
121
+ DEX_TO_ID = {name: i for i, name in enumerate(DEX_NAMES)}
122
+ ID_TO_DEX = {i: name for i, name in enumerate(DEX_NAMES)}
123
+ NUM_DEX_PLATFORMS = len(DEX_NAMES)
124
+
125
+ # --- NEW: Trending List Source Mappings ---
126
+ TRENDING_LIST_SOURCES = [
127
+ "Unknown",
128
+ "Phantom",
129
+ "Dexscreener"
130
+ ]
131
+
132
+ TRENDING_LIST_SOURCE_TO_ID = {name: i for i, name in enumerate(TRENDING_LIST_SOURCES)}
133
+ ID_TO_TRENDING_LIST_SOURCE = {i: name for i, name in enumerate(TRENDING_LIST_SOURCES)}
134
+ NUM_TRENDING_LIST_SOURCES = len(TRENDING_LIST_SOURCES)
135
+
136
+ # --- NEW: Trending List Timeframe Mappings ---
137
+ TRENDING_LIST_TIMEFRAMES = [
138
+ "Unknown",
139
+ "5m",
140
+ "1h",
141
+ "24h"
142
+ ]
143
+ TRENDING_LIST_TIMEFRAME_TO_ID = {name: i for i, name in enumerate(TRENDING_LIST_TIMEFRAMES)}
144
+ ID_TO_TRENDING_LIST_TIMEFRAME = {i: name for i, name in enumerate(TRENDING_LIST_TIMEFRAMES)}
145
+ NUM_TRENDING_LIST_TIMEFRAMES = len(TRENDING_LIST_TIMEFRAMES)
146
+
147
+ # --- NEW: Lighthouse Snapshot Timeframe Mappings ---
148
+ LIGHTHOUSE_TIMEFRAMES = [
149
+ "Unknown",
150
+ "5m",
151
+ "1h",
152
+ "6h",
153
+ "24h"
154
+ ]
155
+ LIGHTHOUSE_TIMEFRAME_TO_ID = {name: i for i, name in enumerate(LIGHTHOUSE_TIMEFRAMES)}
156
+ NUM_LIGHTHOUSE_TIMEFRAMES = len(LIGHTHOUSE_TIMEFRAMES)
157
+
158
+ # --- NEW: TrackerEncoder Vocabularies ---
159
+
160
+ # Alpha Groups (Discord)
161
+ ALPHA_GROUPS = [
162
+ "unknown",
163
+ "Potion",
164
+ "Serenity",
165
+ "Digi World"
166
+ ]
167
+ ALPHA_GROUPS_TO_ID = {name: i for i, name in enumerate(ALPHA_GROUPS)}
168
+ ID_TO_ALPHA_GROUPS = {i: name for i, name in enumerate(ALPHA_GROUPS)}
169
+ NUM_ALPHA_GROUPS = len(ALPHA_GROUPS)
170
+
171
+ # Call Channels (Telegram)
172
+ CALL_CHANNELS = [
173
+ "unknown",
174
+ "MarcosCalls",
175
+ "kobecalls",
176
+ "DEGEMSCALLS"
177
+ ]
178
+ CALL_CHANNELS_TO_ID = {name: i for i, name in enumerate(CALL_CHANNELS)}
179
+ ID_TO_CALL_CHANNELS = {i: name for i, name in enumerate(CALL_CHANNELS)}
180
+ NUM_CALL_CHANNELS = len(CALL_CHANNELS)
181
+
182
+ # CEX Exchanges
183
+ EXCHANGES = [
184
+ "unknown", "mexc", "weex", "binance", "kraken"
185
+ ]
186
+ EXCHANGES_TO_ID = {name: i for i, name in enumerate(EXCHANGES)}
187
+ ID_TO_EXCHANGES = {i: name for i, name in enumerate(EXCHANGES)}
188
+ NUM_EXCHANGES = len(EXCHANGES)
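Every vocabulary above follows the same enumerate-based pattern, with index 0 reserved for the unknown/padding entry so lookups can safely default to it for unseen names. Illustrated with the protocol list:

```python
# Canonical name list; index 0 is the "Unknown" fallback.
PROTOCOL_NAMES = ["Unknown", "Pump V1", "Pump AMM", "Bonk", "Raydium CPMM"]
PROTOCOL_TO_ID = {name: i for i, name in enumerate(PROTOCOL_NAMES)}
ID_TO_PROTOCOL = {i: name for i, name in enumerate(PROTOCOL_NAMES)}

def protocol_id(name: str) -> int:
    # Unseen names map to "Unknown" (ID 0) instead of raising.
    return PROTOCOL_TO_ID.get(name, PROTOCOL_TO_ID["Unknown"])

print(protocol_id("Bonk"), protocol_id("SomeNewDex"))  # 3 0
```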
models/wallet_encoder.py ADDED
@@ -0,0 +1,262 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from typing import List, Dict, Any, Optional
5
+ from PIL import Image
6
+
7
+ # We assume these helper modules are in the same directory
8
+ from models.multi_modal_processor import MultiModalEncoder
9
+ from models.wallet_set_encoder import WalletSetEncoder
10
+
11
+
12
+ class WalletEncoder(nn.Module):
13
+ """
14
+ Encodes a wallet's full identity into a single <WalletEmbedding>.
15
+ UPDATED: Aligned with the final feature spec.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ encoder: MultiModalEncoder ,
21
+ d_model: int = 2048, # Standardized to d_model
22
+ token_vibe_dim: int = 2048, # Expects token vibe of d_model
23
+ set_encoder_nhead: int = 8,
24
+ set_encoder_nlayers: int = 2,
25
+ dtype: torch.dtype = torch.float16
26
+ ):
27
+ """
28
+ Initializes the WalletEncoder.
29
+
30
+ Args:
31
+ d_model (int): The final output dimension (e.g., 2048).
32
+ token_vibe_dim (int): The dimension of the pre-computed
33
+ <TokenVibeEmbedding> (e.g., 2048).
34
+ encoder (MultiModalEncoder): Instantiated SigLIP encoder.
36
+ set_encoder_nhead (int): Attention heads for set encoders.
37
+ set_encoder_nlayers (int): Transformer layers for set encoders.
38
+ dtype (torch.dtype): Data type.
39
+ """
40
+ super().__init__()
41
+ self.d_model = d_model
42
+ self.dtype = dtype
43
+ self.encoder = encoder
44
+
45
+ # --- Dimensions ---
46
+ self.token_vibe_dim = token_vibe_dim
47
+ self.mmp_dim = self.encoder.embedding_dim # 1152
48
+
49
+ # === 1. Profile Encoder (FIXED) ===
50
+ # 1 age + 5 deployer_stats + 1 balance + 4 lifetime_counts +
51
+ # 3 lifetime_trading + 12 1d_stats + 12 7d_stats = 38
52
+ self.profile_numerical_features = 38
53
+ self.profile_num_norm = nn.LayerNorm(self.profile_numerical_features)
54
+
55
+
56
+ # FIXED: Input dim no longer has bool embed or deployed tokens embed
57
+ profile_mlp_in_dim = self.profile_numerical_features # 38
58
+ self.profile_encoder_mlp = self._build_mlp(profile_mlp_in_dim, d_model)
59
+
60
+
61
+
62
+ # === 2. Social Encoder (FIXED) ===
63
+ # 4 booleans: has_pf, has_twitter, has_telegram, is_exchange_wallet
64
+ self.social_bool_embed = nn.Embedding(2, 16)
65
+ # FIXED: Input dim is (4 * 16) + mmp_dim
66
+ social_mlp_in_dim = (16 * 4) + self.mmp_dim # username embed
67
+ self.social_encoder_mlp = self._build_mlp(social_mlp_in_dim, d_model)
68
+
69
+
70
+ # === 3. Holdings Encoder (FIXED) ===
71
+ # 11 original stats + 1 holding_time = 12
72
+ self.holding_numerical_features = 12
73
+ self.holding_num_norm = nn.LayerNorm(self.holding_numerical_features)
74
+
75
+ # FIXED: Input dim no longer uses time_encoder
76
+ holding_row_in_dim = (
77
+ self.token_vibe_dim + # <TokenVibeEmbedding>
78
+ self.holding_numerical_features # 12
79
+ )
80
+ self.holding_row_encoder_mlp = self._build_mlp(holding_row_in_dim, d_model)
81
+
82
+ self.holdings_set_encoder = WalletSetEncoder(
83
+ d_model, set_encoder_nhead, set_encoder_nlayers, dtype=dtype
84
+ )
85
+
86
+
87
+ # === 5. Final Fusion Encoder (Unchanged) ===
88
+ # Still fuses 4 components: Profile, Social, Holdings, Graph
89
+ self.fusion_mlp = nn.Sequential(
90
+ nn.Linear(d_model * 3, d_model * 2), # Input is d_model * 3
91
+ nn.GELU(),
92
+ nn.LayerNorm(d_model * 2),
93
+ nn.Linear(d_model * 2, d_model),
94
+ nn.LayerNorm(d_model)
95
+ )
96
+ self.to(dtype)
97
+
98
+ def _build_mlp(self, in_dim, out_dim):
99
+ return nn.Sequential(
100
+ nn.Linear(in_dim, out_dim * 2),
101
+ nn.GELU(),
102
+ nn.LayerNorm(out_dim * 2),
103
+ nn.Linear(out_dim * 2, out_dim),
104
+ ).to(self.dtype)
105
+
106
+ def _safe_signed_log(self, x: torch.Tensor) -> torch.Tensor:
107
+ # Log-normalizes numerical features (like age, stats, etc.)
108
+ return torch.sign(x) * torch.log1p(torch.abs(x))
109
+
110
+ def _get_device(self) -> torch.device:
111
+ return self.encoder.device
112
+
113
+ def forward(
114
+ self,
115
+ profile_rows: List[Dict[str, Any]],
116
+ social_rows: List[Dict[str, Any]],
117
+ holdings_batch: List[List[Dict[str, Any]]],
118
+ token_vibe_lookup: Dict[str, torch.Tensor],
119
+ embedding_pool: torch.Tensor,
120
+ username_embed_indices: torch.Tensor
121
+ ) -> torch.Tensor:
122
+ device = self._get_device()
123
+
124
+ profile_embed = self._encode_profile_batch(profile_rows, device)
125
+ social_embed = self._encode_social_batch(social_rows, embedding_pool, username_embed_indices, device)
126
+ holdings_embed = self._encode_holdings_batch(holdings_batch, token_vibe_lookup, device)
127
+
128
+ fused = torch.cat([profile_embed, social_embed, holdings_embed], dim=1)
129
+ return self.fusion_mlp(fused)
130
+
131
+ def _encode_profile_batch(self, profile_rows, device):
132
+ batch_size = len(profile_rows)
133
+ # FIXED: 38 numerical features
134
+ num_tensor = torch.zeros(batch_size, self.profile_numerical_features, device=device, dtype=self.dtype)
135
+ # bool_tensor removed
136
+ # time_tensor removed
137
+
138
+ for i, row in enumerate(profile_rows):
139
+ # A: Numerical (FIXED: 38 features, MUST be present)
140
+ num_data = [
141
+ # 1. Age
142
+ row.get('age', 0.0),
143
+ # 2. Deployed Token Aggregates (5)
144
+ row.get('deployed_tokens_count', 0.0),
145
+ row.get('deployed_tokens_migrated_pct', 0.0),
146
+ row.get('deployed_tokens_avg_lifetime_sec', 0.0),
147
+ row.get('deployed_tokens_avg_peak_mc_usd', 0.0),
148
+ row.get('deployed_tokens_median_peak_mc_usd', 0.0),
149
+ # 3. Balance (1)
150
+ row.get('balance', 0.0),
151
+ # 4. Lifetime Transaction Counts (4)
152
+ row.get('transfers_in_count', 0.0), row.get('transfers_out_count', 0.0),
153
+ row.get('spl_transfers_in_count', 0.0), row.get('spl_transfers_out_count', 0.0),
154
+ # 5. Lifetime Trading Stats (3)
155
+ row.get('total_buys_count', 0.0), row.get('total_sells_count', 0.0),
156
+ row.get('total_winrate', 0.0),
157
+ # 6. 1-Day Stats (12)
158
+ row.get('stats_1d_realized_profit_sol', 0.0), row.get('stats_1d_realized_profit_pnl', 0.0),
159
+ row.get('stats_1d_buy_count', 0.0), row.get('stats_1d_sell_count', 0.0),
160
+ row.get('stats_1d_transfer_in_count', 0.0), row.get('stats_1d_transfer_out_count', 0.0),
161
+ row.get('stats_1d_avg_holding_period', 0.0), row.get('stats_1d_total_bought_cost_sol', 0.0),
162
+ row.get('stats_1d_total_sold_income_sol', 0.0), row.get('stats_1d_total_fee', 0.0),
163
+ row.get('stats_1d_winrate', 0.0), row.get('stats_1d_tokens_traded', 0.0),
164
+ # 7. 7-Day Stats (12)
165
+ row.get('stats_7d_realized_profit_sol', 0.0), row.get('stats_7d_realized_profit_pnl', 0.0),
166
+ row.get('stats_7d_buy_count', 0.0), row.get('stats_7d_sell_count', 0.0),
167
+ row.get('stats_7d_transfer_in_count', 0.0), row.get('stats_7d_transfer_out_count', 0.0),
168
+ row.get('stats_7d_avg_holding_period', 0.0), row.get('stats_7d_total_bought_cost_sol', 0.0),
169
+ row.get('stats_7d_total_sold_income_sol', 0.0), row.get('stats_7d_total_fee', 0.0),
170
+ row.get('stats_7d_winrate', 0.0), row.get('stats_7d_tokens_traded', 0.0),
171
+ ]
172
+ num_tensor[i] = torch.tensor(num_data, dtype=self.dtype)
173
+
174
+ # C: Booleans and deployed_tokens lists are GONE
175
+
176
+ # Log-normalize all numerical features (age, stats, etc.)
177
+ num_embed = self.profile_num_norm(self._safe_signed_log(num_tensor))
178
+
179
+ # The profile fused tensor is now just the numerical embeddings
180
+ profile_fused = num_embed
181
+ return self.profile_encoder_mlp(profile_fused)
182
+
183
+ def _encode_social_batch(self, social_rows, embedding_pool, username_embed_indices, device):
184
+ batch_size = len(social_rows)
185
+ # FIXED: 4 boolean features
186
+ bool_tensor = torch.zeros(batch_size, 4, device=device, dtype=torch.long)
187
+ for i, row in enumerate(social_rows):
188
+ # All features MUST be present
189
+ bool_tensor[i, 0] = 1 if row['has_pf_profile'] else 0
190
+ bool_tensor[i, 1] = 1 if row['has_twitter'] else 0
191
+ bool_tensor[i, 2] = 1 if row['has_telegram'] else 0
192
+ # FIXED: Added is_exchange_wallet
193
+ bool_tensor[i, 3] = 1 if row['is_exchange_wallet'] else 0
194
+
195
+ bool_embeds = self.social_bool_embed(bool_tensor).view(batch_size, -1) # [B, 64]
196
+ # --- NEW: Look up pre-computed username embeddings ---
197
+ # --- FIXED: Handle case where embedding_pool is empty ---
198
+ if embedding_pool.numel() > 0:
199
+ # SAFETY: build a padded view so missing indices (-1) map to a zero vector
200
+ pad_row = torch.zeros(1, embedding_pool.size(1), device=device, dtype=embedding_pool.dtype)
201
+ pool_padded = torch.cat([pad_row, embedding_pool], dim=0)
202
+ shifted_idx = torch.where(username_embed_indices >= 0, username_embed_indices + 1, torch.zeros_like(username_embed_indices))
203
+ username_embed = F.embedding(shifted_idx, pool_padded)
204
+ else:
205
+ # If there are no embeddings, create a zero tensor of the correct shape
206
+ username_embed = torch.zeros(batch_size, self.mmp_dim, device=device, dtype=self.dtype)
207
+ social_fused = torch.cat([bool_embeds, username_embed], dim=1)
208
+ return self.social_encoder_mlp(social_fused)
209
+
210
+ def _encode_holdings_batch(self, holdings_batch, token_vibe_lookup, device):
211
+ batch_size = len(holdings_batch)
212
+ max_len = max(len(h) for h in holdings_batch) if any(holdings_batch) else 1
213
+ seq_embeds = torch.zeros(batch_size, max_len, self.d_model, device=device, dtype=self.dtype)
214
+ mask = torch.ones(batch_size, max_len, device=device, dtype=torch.bool)
215
+ default_vibe = torch.zeros(self.token_vibe_dim, device=device, dtype=self.dtype)
216
+
217
+ for i, holdings in enumerate(holdings_batch):
218
+ if not holdings: continue
219
+ h_len = min(len(holdings), max_len)
220
+ holdings = holdings[:h_len]
221
+
222
+ # --- FIXED: Safely get vibes, using default if mint_address is missing or not in lookup ---
223
+ vibes = [token_vibe_lookup.get(row['mint_address'], default_vibe) for row in holdings if 'mint_address' in row]
224
+ if not vibes: continue # Skip if no valid holdings with vibes
225
+ vibe_tensor = torch.stack(vibes)
226
+
227
+ # time_tensor removed
228
+
229
+ num_data_list = []
230
+ for row in holdings:
231
+ # FIXED: All 12 numerical features MUST be present
232
+ num_data = [
233
+ # Use .get() with a 0.0 default for safety
234
+ row.get('holding_time', 0.0),
235
+ row.get('balance_pct_to_supply', 0.0),
236
+ row.get('history_bought_cost_sol', 0.0), # Corrected key from schema
237
+ row.get('bought_amount_sol_pct_to_native_balance', 0.0), # This key is not in schema, will default to 0
238
+ row.get('history_total_buys', 0.0),
239
+ row.get('history_total_sells', 0.0),
240
+ row.get('realized_profit_pnl', 0.0),
241
+ row.get('realized_profit_sol', 0.0),
242
+ row.get('history_transfer_in', 0.0),
243
+ row.get('history_transfer_out', 0.0),
244
+ row.get('avarage_trade_gap_seconds', 0.0),
245
+ row.get('total_fees', 0.0) # Corrected key from schema
246
+ ]
247
+ num_data_list.append(num_data)
248
+
249
+ num_tensor = torch.tensor(num_data_list, device=device, dtype=self.dtype)
250
+
251
+ # Log-normalize all numerical features (holding_time, stats, etc.)
252
+ num_embed = self.holding_num_norm(self._safe_signed_log(num_tensor))
253
+
254
+ # time_embed removed
255
+
256
+ # FIXED: Fused tensor no longer has time_embed
257
+ fused_rows = torch.cat([vibe_tensor, num_embed], dim=1)
258
+ encoded_rows = self.holding_row_encoder_mlp(fused_rows)
259
+ seq_embeds[i, :h_len] = encoded_rows
260
+ mask[i, :h_len] = False
261
+
262
+ return self.holdings_set_encoder(seq_embeds, mask)
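The `_safe_signed_log` transform above compresses heavy-tailed wallet statistics (balances, PnL, trade counts) onto a comparable scale while preserving sign. A minimal stdlib sketch of the same formula, independent of torch:

```python
import math

def safe_signed_log(x: float) -> float:
    """sign(x) * log1p(|x|): a symmetric log compression that is 0 at 0,
    monotonic, and keeps the sign of profits vs. losses."""
    return math.copysign(math.log1p(abs(x)), x)

# Heavy-tailed raw features end up on a comparable scale:
features = [0.0, 9.0, -9.0, 1_000_000.0]
compressed = [safe_signed_log(v) for v in features]
```

Unlike a plain `log(x)`, this mapping is defined for zero and negative values, which matters for features like realized PnL that can take either sign.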
models/wallet_set_encoder.py ADDED
@@ -0,0 +1,99 @@
+import torch
+import torch.nn as nn
+
+class WalletSetEncoder(nn.Module):
+    """
+    Encodes a variable-length set of embeddings into a single fixed-size vector
+    using a Transformer encoder and a [CLS] token.
+
+    This is used to pool:
+    1. A wallet's `wallet_holdings` (a set of [holding_embeds]).
+    2. A wallet's `Neo4J links` (a set of [link_embeds]).
+    3. A wallet's `deployed_tokens` (a set of [token_name_embeds]).
+    """
+    def __init__(
+        self,
+        d_model: int,
+        nhead: int,
+        num_layers: int,
+        dim_feedforward: int = 2048,
+        dropout: float = 0.1,
+        dtype: torch.dtype = torch.float16
+    ):
+        """
+        Initializes the Set Encoder.
+
+        Args:
+            d_model (int): The input/output dimension of the embeddings.
+            nhead (int): Number of attention heads.
+            num_layers (int): Number of transformer layers.
+            dim_feedforward (int): Hidden dimension of the feedforward network.
+            dropout (float): Dropout rate.
+            dtype (torch.dtype): Data type.
+        """
+        super().__init__()
+        self.d_model = d_model
+        self.dtype = dtype
+
+        # The learnable [CLS] token, which aggregates the set representation
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
+        nn.init.normal_(self.cls_token, std=0.02)
+
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=dim_feedforward,
+            dropout=dropout,
+            batch_first=True
+        )
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layer,
+            num_layers=num_layers
+        )
+        self.output_norm = nn.LayerNorm(d_model)
+
+        self.to(dtype)
+
+    def forward(
+        self,
+        item_embeds: torch.Tensor,
+        src_key_padding_mask: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Forward pass.
+
+        Args:
+            item_embeds (torch.Tensor):
+                The batch of item embeddings.
+                Shape: [batch_size, seq_len, d_model]
+            src_key_padding_mask (torch.Tensor):
+                The boolean padding mask for the items, where True indicates
+                a padded position that should be ignored.
+                Shape: [batch_size, seq_len]
+
+        Returns:
+            torch.Tensor: The pooled set embedding.
+                Shape: [batch_size, d_model]
+        """
+        batch_size = item_embeds.shape[0]
+
+        # 1. Create [CLS] token batch and concatenate with item embeddings
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1).to(self.dtype)
+        x = torch.cat([cls_tokens, item_embeds], dim=1)
+
+        # 2. Create the mask for the [CLS] token (it is never masked)
+        cls_mask = torch.zeros(batch_size, 1, device=src_key_padding_mask.device, dtype=torch.bool)
+
+        # 3. Concatenate the [CLS] mask with the item mask
+        full_padding_mask = torch.cat([cls_mask, src_key_padding_mask], dim=1)
+
+        # 4. Pass through Transformer
+        transformer_output = self.transformer_encoder(
+            x,
+            src_key_padding_mask=full_padding_mask
+        )
+
+        # 5. Extract the output of the [CLS] token (the first token in the sequence)
+        cls_output = transformer_output[:, 0, :]
+
+        return self.output_norm(cls_output)
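The forward pass above prepends a never-masked [CLS] position to the key-padding mask before calling the transformer. The mask bookkeeping can be sketched without torch (True marks a padded position to ignore), assuming right-padded sets as produced by `_encode_holdings_batch`:

```python
def build_cls_padding_mask(set_sizes, max_len):
    """One mask row per set: a leading False for the [CLS] slot, then False
    for each real item and True for each padded position."""
    rows = []
    for n in set_sizes:
        item_mask = [j >= n for j in range(max_len)]  # True marks padding
        rows.append([False] + item_mask)              # [CLS] is never masked
    return rows

# Three wallets holding 2, 0, and 3 tokens, padded to length 3:
mask = build_cls_padding_mask([2, 0, 3], max_len=3)
```

Keeping the [CLS] position unmasked guarantees every row has at least one attendable key, even for a wallet with an empty set, so the pooled output stays well-defined.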
neo4j.rs ADDED
@@ -0,0 +1,121 @@
+// Nodes
+
+pub struct Token {
+    pub address: String,
+}
+
+pub struct Wallet {
+    pub address: String,
+}
+
+// Links
+
+/// Tracks direct capital flow and identifies funding chains.
+pub struct TransferLink {
+    pub signature: String,
+    pub source: String,
+    pub destination: String,
+    pub mint: String,
+    pub timestamp: i64,
+}
+
+/// Identifies wallets trading the same token in the same slot.
+pub struct BundleTradeLink {
+    pub signatures: Vec<String>,
+    pub wallet_a: String,
+    pub wallet_b: String,
+    pub mint: String,
+    pub slot: i64,
+    pub timestamp: i64,
+}
+
+/// Reveals a behavioral pattern of one wallet mirroring another's successful trade.
+pub struct CopiedTradeLink {
+    pub leader_buy_sig: String,
+    pub leader_sell_sig: String,
+    pub follower_buy_sig: String,
+    pub follower_sell_sig: String,
+    pub follower: String,
+    pub leader: String,
+    pub mint: String,
+    pub time_gap_on_buy_sec: i64,
+    pub time_gap_on_sell_sec: i64,
+    pub leader_pnl: f64,
+    pub follower_pnl: f64,
+
+    pub leader_buy_total: f64,
+    pub leader_sell_total: f64,
+
+    pub follower_buy_total: f64,
+    pub follower_sell_total: f64,
+    pub follower_buy_slippage: f32,
+    pub follower_sell_slippage: f32,
+}
+
+/// Represents a link where a group of wallets re-engage with a token in a coordinated manner.
+pub struct CoordinatedActivityLink {
+    pub leader_first_sig: String,
+    pub leader_second_sig: String,
+    pub follower_first_sig: String,
+    pub follower_second_sig: String,
+    pub follower: String,
+    pub leader: String,
+    pub mint: String,
+    pub time_gap_on_first_sec: i64,
+    pub time_gap_on_second_sec: i64,
+}
+
+/// Links a token to its original creator.
+pub struct MintedLink {
+    pub signature: String,
+    pub timestamp: i64,
+    pub buy_amount: f64,
+}
+
+/// Connects a token to its successful first-movers.
+pub struct SnipedLink {
+    pub signature: String,
+    pub rank: i64,
+    pub sniped_amount: f64,
+}
+
+/// Connects a wallet to the token supply it locked.
+pub struct LockedSupplyLink {
+    pub signature: String,
+    pub amount: f64,
+    pub unlock_timestamp: u64,
+}
+
+/// Links a wallet to the tokens it burned.
+pub struct BurnedLink {
+    pub signature: String,
+    pub amount: f64,
+    pub timestamp: i64,
+}
+
+/// Identifies wallets that provided liquidity, signaling high conviction.
+pub struct ProvidedLiquidityLink {
+    pub signature: String,
+    pub wallet: String,
+    pub token: String,
+    pub pool_address: String,
+    pub amount_base: f64,
+    pub amount_quote: f64,
+    pub timestamp: i64,
+}
+
+/// A derived link connecting a token to its largest holders.
+pub struct WhaleOfLink {
+    pub wallet: String,
+    pub token: String,
+    pub holding_pct_at_creation: f32, // Holding % when the link was made
+    pub ath_usd_at_creation: f64,     // Token's ATH when the link was made
+}
+
+/// A derived link connecting a token to its most profitable traders.
+pub struct TopTraderOfLink {
+    pub wallet: String,
+    pub token: String,
+    pub pnl_at_creation: f64,     // The PNL that first triggered the link
+    pub ath_usd_at_creation: f64, // Token's ATH when the link was made
+}
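`CopiedTradeLink` records a follower entering a position shortly after a leader (`time_gap_on_buy_sec`). One way to derive candidate pairs for such links, sketched in Python under the assumption that buys arrive as `(wallet, unix_ts)` tuples for a single mint, sorted by time, and that a fixed window defines "copying" (the 30-second threshold is illustrative, not from the source):

```python
def candidate_copied_buys(buys, leader, max_gap_sec=30):
    """Pair the leader's buys of one mint with follower buys that occur
    within max_gap_sec afterwards. `buys` is a time-sorted list of
    (wallet, unix_ts) tuples for a single mint."""
    leader_times = [ts for w, ts in buys if w == leader]
    pairs = []
    for wallet, ts in buys:
        if wallet == leader:
            continue
        for lt in leader_times:
            gap = ts - lt
            if 0 < gap <= max_gap_sec:
                # gap maps onto time_gap_on_buy_sec in CopiedTradeLink
                pairs.append((leader, wallet, gap))
                break
    return pairs

buys = [("L", 100), ("A", 110), ("B", 150), ("L", 200), ("A", 205)]
links = candidate_copied_buys(buys, leader="L")
```

A production detector would additionally match sell legs and compute the PnL and slippage fields, but the time-gap pairing above is the core join.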
ohlc_stats.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f56037cf2ad8502213ee2c8470c314eef83a4cd93063290581ef45fadea5d48
+size 1660
onchain.sql ADDED
@@ -0,0 +1,599 @@
+CREATE TABLE trades
+(
+    timestamp DateTime('UTC'),
+    signature String,
+
+    slot UInt64,
+    transaction_index UInt32,
+    instruction_index UInt16,
+    success Boolean,
+    error Nullable(String),
+
+    -- Fee Structure
+    priority_fee Float64,
+    bribe_fee Float64,
+    coin_creator_fee Float64,
+    mev_protection UInt8,
+
+    -- Parties
+    maker String,
+
+    -- Balances (Pre & Post)
+    base_balance Float64,
+    quote_balance Float64,
+
+    -- Trade Semantics
+    trade_type UInt8,
+    protocol UInt8,
+    platform UInt8,
+
+    -- Asset Info
+    pool_address String,
+    base_address String,
+    quote_address String,
+
+    -- Trade Details
+    slippage Float32,
+    price_impact Float32,
+
+    base_amount UInt64,
+    quote_amount UInt64,
+
+    price Float64,
+    price_usd Float64,
+
+    total Float64,
+    total_usd Float64
+)
+ENGINE = MergeTree()
+ORDER BY (base_address, timestamp, maker, signature);
+
+--- mint
+CREATE TABLE mints
+(
+    -- === Transaction Details ===
+    -- Solana signatures are usually 88 characters, but we use String for flexibility.
+    signature String,
+    -- Converted to DateTime for easier time-based operations in ClickHouse.
+    timestamp DateTime('UTC'),
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- === Protocol & Platform ===
+    -- Protocol codes: 0=Unknown, 1=PumpFunLaunchpad, 2=RaydiumLaunchpad,
+    -- 3=PumpFunAMM, 4=RaydiumCPMM, 5=MeteoraBonding
+    protocol UInt8,
+
+    -- === Mint & Pool Details ===
+    mint_address String,
+    creator_address String,
+    pool_address String,
+
+    -- === Liquidity Details ===
+    initial_base_liquidity UInt64,
+    initial_quote_liquidity UInt64,
+
+    -- === Token Metadata ===
+    token_name Nullable(String),
+    token_symbol Nullable(String),
+    token_uri Nullable(String),
+    token_decimals UInt8,
+    total_supply UInt64,
+
+    is_mutable Boolean,
+    update_authority Nullable(String),
+    mint_authority Nullable(String),
+    freeze_authority Nullable(String)
+)
+ENGINE = MergeTree()
+ORDER BY (timestamp, creator_address, mint_address);
+
+CREATE TABLE migrations
+(
+    -- Transaction Details
+    timestamp DateTime('UTC'),
+
+    signature String,
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Protocol & Platform
+    protocol UInt8,
+
+    -- Migration Details
+    mint_address String,
+    virtual_pool_address String,
+    pool_address String,
+
+    -- Liquidity Details
+    migrated_base_liquidity Nullable(UInt64),
+    migrated_quote_liquidity Nullable(UInt64)
+)
+ENGINE = MergeTree()
+ORDER BY (mint_address, virtual_pool_address, pool_address, timestamp);
+
+CREATE TABLE fee_collections
+(
+    -- Transaction Details
+    timestamp DateTime('UTC'),
+
+    signature String,
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Protocol & Platform
+    protocol UInt8,
+
+    -- Fee Details
+    vault_address String,
+    recipient_address String,
+
+    -- Collected Amounts
+    token_0_mint_address String,
+    token_0_amount Float64,
+    token_1_mint_address Nullable(String),
+    token_1_amount Nullable(Float64)
+)
+ENGINE = MergeTree()
+ORDER BY (vault_address, recipient_address, timestamp);
+
+CREATE TABLE liquidity
+(
+    -- Transaction Details --
+    signature String,
+    timestamp DateTime('UTC'),
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Protocol Info --
+    protocol UInt8,
+
+    -- LP Action Details --
+    change_type UInt8,
+    lp_provider String,
+    pool_address String,
+
+    -- Token Amounts --
+    base_amount UInt64,
+    quote_amount UInt64
+)
+ENGINE = MergeTree()
+ORDER BY (timestamp, pool_address, lp_provider);
+
+CREATE TABLE pool_creations
+(
+    -- Transaction Details --
+    signature String,
+    timestamp DateTime('UTC'),
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Protocol Info --
+    protocol UInt8,
+
+    -- Pool & Token Details --
+    creator_address String,
+    pool_address String,
+    base_address String,
+    quote_address String,
+    lp_token_address String,
+
+    -- Optional Initial State --
+    initial_base_liquidity Nullable(UInt64),
+    initial_quote_liquidity Nullable(UInt64),
+    base_decimals Nullable(UInt8),
+    quote_decimals Nullable(UInt8)
+)
+ENGINE = MergeTree()
+ORDER BY (base_address, creator_address);
+
+CREATE TABLE transfers
+(
+    -- Transaction Details
+    timestamp DateTime('UTC'),
+    signature String,
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Transfer Details
+    source String,
+    destination String,
+
+    -- Amount & Mint Details
+    mint_address String,
+    amount UInt64,
+    amount_decimal Float64,
+
+    -- Balance Context
+    source_balance Float64,
+    destination_balance Float64
+)
+ENGINE = MergeTree()
+ORDER BY (source, destination, mint_address, timestamp);
+
+CREATE TABLE supply_locks
+(
+    -- === Transaction Details ===
+    timestamp DateTime('UTC'),
+
+    signature String,
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- === Protocol Info ===
+    protocol UInt8,
+
+    -- === Vesting Details ===
+    contract_address String,
+    sender String,
+    recipient String,
+    mint_address String,
+    total_locked_amount Float64,
+    final_unlock_timestamp UInt64
+)
+ENGINE = MergeTree()
+ORDER BY (timestamp, mint_address, sender, recipient);
+
+CREATE TABLE supply_lock_actions
+(
+    -- === Transaction Details ===
+    signature String,
+    timestamp DateTime('UTC'),
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- === Protocol Info ===
+    protocol UInt8,
+
+    -- === Action Details ===
+    action_type UInt8, -- e.g., 0 for Withdraw, 1 for Topup
+    contract_address String,
+    user String,
+    mint_address String,
+    amount Float64
+)
+ENGINE = MergeTree()
+ORDER BY (timestamp, mint_address, user);
+
+CREATE TABLE burns
+(
+    -- Transaction Details
+    timestamp DateTime('UTC'),
+    signature String,
+    slot UInt64,
+    success Boolean,
+    error Nullable(String),
+    priority_fee Float64,
+
+    -- Burn Details
+    mint_address String,
+    source String,
+    amount UInt64,
+    amount_decimal Float64,
+
+    source_balance Float64
+)
+ENGINE = MergeTree()
+ORDER BY (mint_address, source, timestamp);
+
+-------- Wallet schema
+
+CREATE TABLE wallet_profiles
+(
+    updated_at DateTime('UTC'),
+    first_seen_ts DateTime('UTC'),
+    last_seen_ts DateTime('UTC'),
+
+    wallet_address String,
+    tags Array(String),
+    deployed_tokens Array(String),
+
+    funded_from String,
+    funded_timestamp UInt32,
+    funded_signature String,
+    funded_amount Float64
+)
+ENGINE = ReplacingMergeTree(updated_at)
+PRIMARY KEY (wallet_address)
+ORDER BY (wallet_address);
+
+CREATE TABLE wallet_profile_metrics
+(
+    updated_at DateTime('UTC'),
+    wallet_address String,
+    balance Float64,
+
+    transfers_in_count UInt32,
+    transfers_out_count UInt32,
+    spl_transfers_in_count UInt32,
+    spl_transfers_out_count UInt32,
+
+    total_buys_count UInt32,
+    total_sells_count UInt32,
+    total_winrate Float32,
+
+    stats_1d_realized_profit_sol Float64,
+    stats_1d_realized_profit_usd Float64,
+    stats_1d_realized_profit_pnl Float32,
+    stats_1d_buy_count UInt32,
+    stats_1d_sell_count UInt32,
+    stats_1d_transfer_in_count UInt32,
+    stats_1d_transfer_out_count UInt32,
+    stats_1d_avg_holding_period Float32,
+    stats_1d_total_bought_cost_sol Float64,
+    stats_1d_total_bought_cost_usd Float64,
+    stats_1d_total_sold_income_sol Float64,
+    stats_1d_total_sold_income_usd Float64,
+    stats_1d_total_fee Float64,
+    stats_1d_winrate Float32,
+    stats_1d_tokens_traded UInt32,
+
+    stats_7d_realized_profit_sol Float64,
+    stats_7d_realized_profit_usd Float64,
+    stats_7d_realized_profit_pnl Float32,
+    stats_7d_buy_count UInt32,
+    stats_7d_sell_count UInt32,
+    stats_7d_transfer_in_count UInt32,
+    stats_7d_transfer_out_count UInt32,
+    stats_7d_avg_holding_period Float32,
+    stats_7d_total_bought_cost_sol Float64,
+    stats_7d_total_bought_cost_usd Float64,
+    stats_7d_total_sold_income_sol Float64,
+    stats_7d_total_sold_income_usd Float64,
+    stats_7d_total_fee Float64,
+    stats_7d_winrate Float32,
+    stats_7d_tokens_traded UInt32,
+
+    stats_30d_realized_profit_sol Float64,
+    stats_30d_realized_profit_usd Float64,
+    stats_30d_realized_profit_pnl Float32,
+    stats_30d_buy_count UInt32,
+    stats_30d_sell_count UInt32,
+    stats_30d_transfer_in_count UInt32,
+    stats_30d_transfer_out_count UInt32,
+    stats_30d_avg_holding_period Float32,
+    stats_30d_total_bought_cost_sol Float64,
+    stats_30d_total_bought_cost_usd Float64,
+    stats_30d_total_sold_income_sol Float64,
+    stats_30d_total_sold_income_usd Float64,
+    stats_30d_total_fee Float64,
+    stats_30d_winrate Float32,
+    stats_30d_tokens_traded UInt32
+)
+ENGINE = MergeTree
+ORDER BY (wallet_address, updated_at);
+
+CREATE TABLE wallet_holdings
+(
+    updated_at DateTime('UTC'),
+    start_holding_at DateTime('UTC'),
+
+    wallet_address String,
+    mint_address String,
+    current_balance Float64,
+
+    realized_profit_pnl Float32,
+    realized_profit_sol Float64,
+    realized_profit_usd Float64,
+
+    history_transfer_in UInt32,
+    history_transfer_out UInt32,
+
+    history_bought_amount Float64,
+    history_bought_cost_sol Float64,
+    history_sold_amount Float64,
+    history_sold_income_sol Float64
+)
+ENGINE = MergeTree
+ORDER BY (wallet_address, mint_address, updated_at);
+
+CREATE TABLE tokens
+(
+    updated_at DateTime('UTC'),
+    created_at DateTime('UTC'),
+
+    -- Core Identifiers
+    token_address String,
+    name String,
+    symbol String,
+    token_uri String,
+
+    -- Token Metadata
+    decimals UInt8,
+    creator_address String,
+    pool_addresses Array(String), -- Map Vec<String> to Array(String)
+
+    -- Protocol/Launchpad
+    launchpad UInt8,
+    protocol UInt8,
+    total_supply UInt64,
+
+    -- Authorities/Flags
+    is_mutable Boolean, -- Alias for UInt8, but Boolean is clearer/modern
+    update_authority Nullable(String), -- Map Option<String> to Nullable(String)
+    mint_authority Nullable(String),
+    freeze_authority Nullable(String)
+)
+ENGINE = ReplacingMergeTree(updated_at)
+PRIMARY KEY (token_address)
+ORDER BY (token_address, updated_at);
+
+-- Latest tokens (one row per token_address)
+CREATE TABLE tokens_latest
+(
+    updated_at DateTime('UTC'),
+    created_at DateTime('UTC'),
+
+    token_address String,
+    name String,
+    symbol String,
+    token_uri String,
+
+    decimals UInt8,
+    creator_address String,
+    pool_addresses Array(String),
+
+    launchpad UInt8,
+    protocol UInt8,
+    total_supply UInt64,
+
+    is_mutable Boolean,
+    update_authority Nullable(String),
+    mint_authority Nullable(String),
+    freeze_authority Nullable(String)
+)
+ENGINE = ReplacingMergeTree(updated_at)
+ORDER BY (token_address);
+
+CREATE TABLE token_metrics
+(
+    updated_at DateTime('UTC'),
+    token_address String,
+    total_volume_usd Float64,
+    total_buys UInt32,
+    total_sells UInt32,
+    unique_holders UInt32,
+    ath_price_usd Float64
+)
+ENGINE = MergeTree
+ORDER BY (token_address, updated_at);
+
+-- ========= Latest snapshot helper tables =========
+-- Keep full history in the base tables above, but read fast from these ReplacingMergeTree snapshots.
+
+-- Latest wallet profile metrics (one row per wallet_address)
+CREATE TABLE wallet_profile_metrics_latest
+(
+    updated_at DateTime('UTC'),
+    wallet_address String,
+    balance Float64,
+
+    transfers_in_count UInt32,
+    transfers_out_count UInt32,
+    spl_transfers_in_count UInt32,
+    spl_transfers_out_count UInt32,
+
+    total_buys_count UInt32,
+    total_sells_count UInt32,
+    total_winrate Float32,
+
+    stats_1d_realized_profit_sol Float64,
+    stats_1d_realized_profit_usd Float64,
+    stats_1d_realized_profit_pnl Float32,
+    stats_1d_buy_count UInt32,
+    stats_1d_sell_count UInt32,
+    stats_1d_transfer_in_count UInt32,
+    stats_1d_transfer_out_count UInt32,
+    stats_1d_avg_holding_period Float32,
+    stats_1d_total_bought_cost_sol Float64,
+    stats_1d_total_bought_cost_usd Float64,
+    stats_1d_total_sold_income_sol Float64,
+    stats_1d_total_sold_income_usd Float64,
+    stats_1d_total_fee Float64,
+    stats_1d_winrate Float32,
+    stats_1d_tokens_traded UInt32,
+
+    stats_7d_realized_profit_sol Float64,
+    stats_7d_realized_profit_usd Float64,
+    stats_7d_realized_profit_pnl Float32,
+    stats_7d_buy_count UInt32,
+    stats_7d_sell_count UInt32,
+    stats_7d_transfer_in_count UInt32,
+    stats_7d_transfer_out_count UInt32,
+    stats_7d_avg_holding_period Float32,
+    stats_7d_total_bought_cost_sol Float64,
+    stats_7d_total_bought_cost_usd Float64,
+    stats_7d_total_sold_income_sol Float64,
+    stats_7d_total_sold_income_usd Float64,
+    stats_7d_total_fee Float64,
+    stats_7d_winrate Float32,
+    stats_7d_tokens_traded UInt32,
+
+    stats_30d_realized_profit_sol Float64,
+    stats_30d_realized_profit_usd Float64,
+    stats_30d_realized_profit_pnl Float32,
+    stats_30d_buy_count UInt32,
+    stats_30d_sell_count UInt32,
+    stats_30d_transfer_in_count UInt32,
+    stats_30d_transfer_out_count UInt32,
+    stats_30d_avg_holding_period Float32,
+    stats_30d_total_bought_cost_sol Float64,
+    stats_30d_total_bought_cost_usd Float64,
+    stats_30d_total_sold_income_sol Float64,
+    stats_30d_total_sold_income_usd Float64,
+    stats_30d_total_fee Float64,
+    stats_30d_winrate Float32,
+    stats_30d_tokens_traded UInt32
+)
+ENGINE = ReplacingMergeTree(updated_at)
+ORDER BY (wallet_address);
+
+-- Latest wallet holdings (one row per wallet_address + mint_address)
+CREATE TABLE wallet_holdings_latest
+(
+    updated_at DateTime('UTC'),
+    start_holding_at DateTime('UTC'),
+
+    wallet_address String,
+    mint_address String,
+    current_balance Float64,
+
+    realized_profit_pnl Float32,
+    realized_profit_sol Float64,
+    realized_profit_usd Float64,
+
+    history_transfer_in UInt32,
+    history_transfer_out UInt32,
+
+    history_bought_amount Float64,
+    history_bought_cost_sol Float64,
+    history_sold_amount Float64,
+    history_sold_income_sol Float64
+)
+ENGINE = ReplacingMergeTree(updated_at)
+ORDER BY (wallet_address, mint_address);
+
+-- Latest token metrics (one row per token_address)
+CREATE TABLE token_metrics_latest
+(
+    updated_at DateTime('UTC'),
+    token_address String,
+    total_volume_usd Float64,
+    total_buys UInt32,
+    total_sells UInt32,
+    unique_holders UInt32,
+    ath_price_usd Float64
+)
+ENGINE = ReplacingMergeTree(updated_at)
+ORDER BY (token_address);
+
+CREATE TABLE known_wallets
+(
+    `wallet_address` String,
+    `name` String, -- e.g., "Pump.fun Fee Vault", "Raydium CPMM Authority V4", "KOL - Ansem"
596
+     `tag` String -- e.g., "fee_vault", "dex_authority", "kol", "exchange"
597
+ )
598
+ ENGINE = ReplacingMergeTree()
599
+ ORDER BY (wallet_address);
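The `ReplacingMergeTree(updated_at)` engines above keep, after background merges, only the row with the greatest `updated_at` per `ORDER BY` key. A minimal Python sketch of that dedup rule (the rows and column names here are illustrative, not data from the schema):

```python
def latest_snapshot(rows, key_cols, version_col="updated_at"):
    """Emulate ReplacingMergeTree(version) semantics: keep, per key,
    the row with the greatest version value (later rows win ties)."""
    latest = {}
    for row in rows:
        key = tuple(row[c] for c in key_cols)
        kept = latest.get(key)
        if kept is None or row[version_col] >= kept[version_col]:
            latest[key] = row
    return list(latest.values())

rows = [
    {"wallet_address": "A", "updated_at": 1, "balance": 10.0},
    {"wallet_address": "A", "updated_at": 3, "balance": 12.5},
    {"wallet_address": "B", "updated_at": 2, "balance": 7.0},
]
snapshot = latest_snapshot(rows, key_cols=("wallet_address",))
```

Note that until parts actually merge, the `_latest` tables can still hold several versions of a key, so ad-hoc queries typically add `FINAL` (or use `argMax(..., updated_at)`) to force this same latest-row view.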
pre_cache.sh ADDED
@@ -0,0 +1,6 @@
1
+ # Flags below match scripts/cache_dataset.py's argparse options;
+ # ClickHouse/Neo4j connection settings are read from environment variables.
+ python scripts/cache_dataset.py \
+     --output-dir data/cache/epoch_851 \
+     --max-samples 100 \
+     --start-date 2024-01-01
scripts/cache_dataset.py ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to pre-generate and cache dataset items from the OracleDataset.
4
+
5
+ This script connects to the databases, instantiates the data loader in 'online' mode,
6
+ and iterates through the requested number of samples, saving each processed item
7
+ to a file. This avoids costly data fetching and processing during training.
8
+
9
+ Example usage:
10
+ python scripts/cache_dataset.py --output-dir ./data/cached_dataset --max-samples 1000 --start-date 2024-05-01
11
+ """
12
+
13
+ import argparse
14
+ import datetime
15
+ import os
16
+ import sys
17
+ from pathlib import Path
18
+
19
+ import torch
20
+ import clickhouse_connect
21
+ from neo4j import GraphDatabase
22
+ from tqdm import tqdm
23
+
24
+ # Add apollo to path to import modules
25
+ sys.path.append(str(Path(__file__).resolve().parents[1]))
26
+
27
+ from data.data_loader import OracleDataset
28
+ from data.data_fetcher import DataFetcher
29
+
30
+ # --- Database Connection Details (can be overridden by env vars) ---
31
+ CLICKHOUSE_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
32
+ CLICKHOUSE_PORT = int(os.getenv("CLICKHOUSE_PORT", "8123"))
33
+ CLICKHOUSE_USER = os.getenv("CLICKHOUSE_USER", "default")
34
+ CLICKHOUSE_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
35
+ CLICKHOUSE_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
36
+
37
+ NEO4J_URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")
38
+ NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
39
+ NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "password")
40
+
41
+ def parse_args():
42
+ parser = argparse.ArgumentParser(description="Cache OracleDataset items to disk.")
43
+ parser.add_argument(
44
+ "--output-dir",
45
+ type=str,
46
+ required=True,
47
+ help="Directory to save the cached .pt files."
48
+ )
49
+ parser.add_argument(
50
+ "--max-samples",
51
+ type=int,
52
+ default=None,
53
+ help="Maximum number of samples to generate and cache. Defaults to all available."
54
+ )
55
+ parser.add_argument(
56
+ "--start-date",
57
+ type=str,
58
+ default=None,
59
+ help="Start date for fetching mints in YYYY-MM-DD format. Fetches all mints on or after this UTC date."
60
+ )
61
+ parser.add_argument(
62
+ "--t-cutoff-seconds",
63
+ type=int,
64
+ default=60,
65
+ help="Time in seconds after mint to set the data cutoff (T_cutoff)."
66
+ )
67
+ parser.add_argument(
68
+ "--ohlc-stats-path",
69
+ type=str,
70
+ default="./data/ohlc_stats.npz",
71
+ help="Path to the OHLC stats file for normalization."
72
+ )
73
+ parser.add_argument(
74
+ "--min-trade-usd",
75
+ type=float,
76
+ default=5.0,
77
+ help="Minimum USD value for a trade to be included in the event sequence. Defaults to 5.0."
78
+ )
79
+ return parser.parse_args()
80
+
81
+ def main():
82
+ args = parse_args()
83
+
84
+ output_dir = Path(args.output_dir)
85
+ output_dir.mkdir(parents=True, exist_ok=True)
86
+ print(f"INFO: Caching dataset to {output_dir.resolve()}")
87
+
88
+ start_date_dt = None
89
+ if args.start_date:
90
+ try:
91
+ start_date_dt = datetime.datetime.strptime(args.start_date, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
92
+ print(f"INFO: Filtering mints on or after {start_date_dt}")
93
+ except ValueError:
94
+ print(f"ERROR: Invalid start-date format. Please use YYYY-MM-DD.", file=sys.stderr)
95
+ sys.exit(1)
96
+
97
+ # --- 1. Set up database connections ---
98
+ try:
99
+ print("INFO: Connecting to ClickHouse...")
100
+ clickhouse_client = clickhouse_connect.get_client(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT, user=CLICKHOUSE_USER, password=CLICKHOUSE_PASSWORD, database=CLICKHOUSE_DATABASE)
101
+ print("INFO: Connecting to Neo4j...")
102
+ neo4j_driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
103
+ except Exception as e:
104
+ print(f"ERROR: Failed to connect to databases: {e}", file=sys.stderr)
105
+ sys.exit(1)
106
+
107
+ # --- 2. Initialize DataFetcher and OracleDataset ---
108
+ data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
109
+
110
+ dataset = OracleDataset(
111
+ data_fetcher=data_fetcher,
112
+ max_samples=args.max_samples,
113
+ start_date=start_date_dt,
114
+ t_cutoff_seconds=args.t_cutoff_seconds,
115
+ ohlc_stats_path=args.ohlc_stats_path,
116
+ horizons_seconds=[60, 300, 900, 1800, 3600],
117
+ quantiles=[0.5],
118
+ min_trade_usd=args.min_trade_usd
119
+ )
120
+
121
+ if len(dataset) == 0:
122
+ print("WARNING: Dataset initialization resulted in 0 samples. Nothing to cache.")
123
+ return
124
+
125
+ # --- 3. Iterate and cache each item ---
126
+ print(f"INFO: Starting to generate and cache {len(dataset)} samples...")
127
+ skipped_count = 0
128
+ for i in tqdm(range(len(dataset)), desc="Caching samples"):
129
+ try:
130
+ item = dataset.__cacheitem__(i)
131
+ if item is None:
132
+ skipped_count += 1
133
+ continue
134
+ output_path = output_dir / f"sample_{i}.pt"
135
+ torch.save(item, output_path)
136
+ except Exception as e:
137
+ print(f"\nERROR: Failed to generate or save sample {i} for mint '{dataset.sampled_mints[i]['mint_address']}'. Error: {e}", file=sys.stderr)
138
+ skipped_count += 1
139
+ continue
140
+
141
+ print(f"\n--- Caching Complete ---\nSuccessfully cached: {len(dataset) - skipped_count} items.\nSkipped: {skipped_count} items.\nCache location: {output_dir.resolve()}")
142
+
143
+ # --- 4. Close connections ---
144
+ clickhouse_client.close()
145
+ neo4j_driver.close()
146
+
147
+ if __name__ == "__main__":
148
+ main()
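Since the loop writes one `sample_{i}.pt` per dataset index, a rerun could skip indices already on disk instead of regenerating everything. A minimal sketch of such a resume check (hypothetical helper, not part of the script):

```python
from pathlib import Path
import tempfile

def pending_indices(output_dir: Path, total: int) -> list[int]:
    """Return dataset indices that do not yet have a cached sample_{i}.pt."""
    cached = {p.name for p in output_dir.glob("sample_*.pt")}
    return [i for i in range(total) if f"sample_{i}.pt" not in cached]

# Demo: pretend samples 0 and 2 were cached by an earlier, interrupted run.
tmp = Path(tempfile.mkdtemp())
(tmp / "sample_0.pt").touch()
(tmp / "sample_2.pt").touch()
todo = pending_indices(tmp, 4)
```

Iterating over `todo` instead of `range(len(dataset))` would make the caching loop idempotent across restarts.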
scripts/download_epoch_artifacts.py ADDED
@@ -0,0 +1,96 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download a specific epoch's parquet/Neo4j artifacts from Hugging Face.
4
+
5
+ Usage:
6
+ HF_TOKEN=your_token \
7
+ python scripts/download_epoch_artifacts.py --epoch 851
8
+ """
9
+
10
+ import argparse
11
+ import os
12
+ from pathlib import Path
13
+ from typing import List
14
+
15
+ from huggingface_hub import snapshot_download
16
+
17
+
18
+ REPO_ID = "zirobtc/pump-fun-dataset"
19
+ REPO_TYPE = "model" # dataset is not used here per user note
20
+ DEFAULT_DEST_DIR = "./data/pump_fun"
21
+
22
+ # File stems that are suffixed with `_epoch_{epoch}.parquet`
23
+ PARQUET_STEMS = [
24
+ "wallet_profiles",
25
+ "wallet_holdings",
26
+ "trades",
27
+ "transfers",
28
+ "burns",
29
+ "tokens",
30
+ "mints",
31
+ "liquidity",
32
+ "pool_creations",
33
+ "token_metrics",
34
+ "wallet_profile_metrics",
35
+ "migrations",
36
+ "fee_collections",
37
+ "supply_locks",
38
+ "supply_lock_actions",
39
+ "known_wallets",
40
+ ]
41
+
42
+ # Single Neo4j dump name
43
+ NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"
44
+
45
+
46
+ def build_patterns(epoch: int) -> List[str]:
47
+ epoch_str = str(epoch)
48
+ parquet_patterns = [f"{stem}_epoch_{epoch_str}.parquet" for stem in PARQUET_STEMS]
49
+ neo4j_pattern = NEO4J_FILENAME.format(epoch=epoch_str)
50
+ return parquet_patterns + [neo4j_pattern]
51
+
52
+
53
+ def parse_args() -> argparse.Namespace:
54
+ parser = argparse.ArgumentParser(description="Download epoch artifacts from Hugging Face.")
55
+ parser.add_argument("--epoch", type=int, required=False, help="Epoch number to download (e.g., 851)", default=851)
56
+ parser.add_argument(
57
+ "--token",
58
+ type=str,
59
+ default=None,
60
+ required=False,
61
+ help="Hugging Face token (or set HF_TOKEN env var)",
62
+ )
63
+
64
+ return parser.parse_args()
65
+
66
+
67
+ def main() -> None:
68
+ args = parse_args()
69
+ token = args.token or os.environ.get("HF_TOKEN")
70
+
71
+
72
+ patterns = build_patterns(args.epoch)
73
+ dest_root = Path(DEFAULT_DEST_DIR).expanduser()
74
+ dest_dir = dest_root / f"epoch_{args.epoch}"
75
+ dest_dir.mkdir(parents=True, exist_ok=True)
76
+
77
+ print(f"Downloading epoch {args.epoch} files from {REPO_ID} to {dest_dir}")
78
+ print("Files:")
79
+ for p in patterns:
80
+ print(f" - {p}")
81
+
82
+ snapshot_download(
83
+ repo_id=REPO_ID,
84
+ repo_type=REPO_TYPE,
85
+ local_dir=str(dest_dir),
86
+ local_dir_use_symlinks=False,
87
+ allow_patterns=patterns,
88
+ resume_download=True,
89
+ token=token,
90
+ )
91
+
92
+ print("Download complete.")
93
+
94
+
95
+ if __name__ == "__main__":
96
+ main()
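To see the file list a given epoch resolves to, the pattern-building step can be exercised standalone (constants copied from the script, with the stem list abbreviated for brevity):

```python
# Abbreviated stem list; the script enumerates sixteen stems in total.
PARQUET_STEMS = ["wallet_profiles", "wallet_holdings", "trades"]
NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"

def build_patterns(epoch: int) -> list[str]:
    """Each stem is suffixed with _epoch_{epoch}.parquet; the single
    Neo4j dump filename is appended at the end."""
    epoch_str = str(epoch)
    parquet_patterns = [f"{stem}_epoch_{epoch_str}.parquet" for stem in PARQUET_STEMS]
    return parquet_patterns + [NEO4J_FILENAME.format(epoch=epoch_str)]

patterns = build_patterns(851)
```

These exact filenames are what `snapshot_download(allow_patterns=...)` matches, so only the requested epoch's artifacts are fetched from the repo.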
scripts/ingest_epoch.py ADDED
@@ -0,0 +1,713 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ETL Pipeline: Download epoch Parquet files, ingest into ClickHouse, and delete local files.
4
+
5
+ Usage:
6
+ python scripts/ingest_epoch.py --epoch 851
7
+
8
+ Environment Variables:
9
+ HF_TOKEN: Hugging Face token for downloading private datasets.
10
+ CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD, CLICKHOUSE_DATABASE
11
+ """
12
+
13
+ import argparse
14
+ import os
15
+ import sys
16
+ import time
17
+ from pathlib import Path
18
+
19
+ import clickhouse_connect
20
+ from huggingface_hub import snapshot_download
21
+ from tqdm import tqdm
22
+
23
+ # Hugging Face config
24
+ REPO_ID = "zirobtc/pump-fun-dataset"
25
+ REPO_TYPE = "model"
26
+ DEFAULT_DEST_DIR = "./data/pump_fun"
27
+ CLICKHOUSE_DOCKER_CONTAINER = "db-clickhouse"
28
+ CLICKHOUSE_INSERT_SETTINGS = "max_insert_threads=1,max_block_size=65536"
29
+ NEO4J_DOCKER_CONTAINER = "neo4j"
30
+ NEO4J_TARGET_DB = "neo4j"
31
+ NEO4J_TEMP_DB_PREFIX = "epoch"
32
+ NEO4J_MERGE_BATCH_SIZE = 2000
33
+ NEO4J_URI = "bolt://localhost:7687"
34
+ NEO4J_USER = None
35
+ NEO4J_PASSWORD = None
36
+
37
+ # Parquet file stems -> ClickHouse table names
38
+ # Maps the file stem to the target table. Usually they match.
39
+ PARQUET_TABLE_MAP = {
40
+ "wallet_profiles": "wallet_profiles",
41
+ "wallet_holdings": "wallet_holdings",
42
+ "trades": "trades",
43
+ "transfers": "transfers",
44
+ "burns": "burns",
45
+ "tokens": "tokens",
46
+ "mints": "mints",
47
+ "liquidity": "liquidity",
48
+ "pool_creations": "pool_creations",
49
+ "token_metrics": "token_metrics",
50
+ "wallet_profile_metrics": "wallet_profile_metrics",
51
+ "migrations": "migrations",
52
+ "fee_collections": "fee_collections",
53
+ "supply_locks": "supply_locks",
54
+ "supply_lock_actions": "supply_lock_actions",
55
+ "known_wallets": "known_wallets",
56
+ }
57
+
58
+ # Neo4j dump filename pattern
59
+ NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"
60
+
61
+ # ClickHouse connection defaults (can be overridden by env vars)
62
+ CH_HOST = os.getenv("CLICKHOUSE_HOST", "localhost")
63
+ CH_PORT = int(os.getenv("CLICKHOUSE_PORT", "8123"))
64
+ CH_USER = os.getenv("CLICKHOUSE_USER", "default")
65
+ CH_PASSWORD = os.getenv("CLICKHOUSE_PASSWORD", "")
66
+ CH_DATABASE = os.getenv("CLICKHOUSE_DATABASE", "default")
67
+
68
+
69
+ def build_patterns(epoch: int) -> list[str]:
70
+ """Build the list of file patterns to download for a given epoch."""
71
+ epoch_str = str(epoch)
72
+ parquet_patterns = [f"{stem}_epoch_{epoch_str}.parquet" for stem in PARQUET_TABLE_MAP.keys()]
73
+ neo4j_pattern = NEO4J_FILENAME.format(epoch=epoch_str)
74
+ return parquet_patterns + [neo4j_pattern]
75
+
76
+
77
+ def download_epoch(epoch: int, dest_dir: Path, token: str | None) -> None:
78
+ """Download epoch artifacts from Hugging Face."""
79
+ patterns = build_patterns(epoch)
80
+ dest_dir.mkdir(parents=True, exist_ok=True)
81
+
82
+ print(f"📥 Downloading epoch {epoch} from {REPO_ID}...")
83
+ snapshot_download(
84
+ repo_id=REPO_ID,
85
+ repo_type=REPO_TYPE,
86
+ local_dir=str(dest_dir),
87
+ local_dir_use_symlinks=False,
88
+ allow_patterns=patterns,
89
+ resume_download=True,
90
+ token=token,
91
+ )
92
+ print("✅ Download complete.")
93
+
94
+
95
+ def ingest_parquet(client, table_name: str, parquet_path: Path, dry_run: bool = False) -> bool:
96
+ """
97
+ Ingest a Parquet file into a ClickHouse table.
98
+ Returns True on success.
99
+ """
100
+ if dry_run:
101
+ print(f" [DRY-RUN] insert {parquet_path.name} -> {table_name}")
102
+ return True
103
+
104
+ try:
105
+ with parquet_path.open("rb") as fh:
106
+ magic = fh.read(4)
107
+ if magic != b"PAR1":
108
+ print(f" ⚠️ Skipping {parquet_path.name}: not a Parquet file.")
109
+ return False
110
+
111
+         # FROM INFILE is a feature of the clickhouse-client CLI, not of the
+         # HTTP client used elsewhere in this script, so shell out to
+         # clickhouse-client (or to the docker container below) rather than
+         # loading the whole Parquet file into memory.
+         import subprocess
115
+ infile_query = f"INSERT INTO {table_name} FROM INFILE '{parquet_path.resolve()}' FORMAT Parquet"
116
+ try:
117
+ cmd = [
118
+ "clickhouse-client",
119
+ "--host", CH_HOST,
120
+ "--port", str(CH_PORT),
121
+ "--user", CH_USER,
122
+ "--password", CH_PASSWORD,
123
+ "--database", CH_DATABASE,
124
+ "--query", infile_query,
125
+ ]
126
+ subprocess.run(cmd, check=True)
127
+ return True
128
+ except FileNotFoundError:
129
+ pass
130
+
131
+ # Docker fallback for ClickHouse container
132
+ ch_container = CLICKHOUSE_DOCKER_CONTAINER
133
+ try:
134
+ tmp_path = f"/tmp/{parquet_path.name}"
135
+ subprocess.run(
136
+ ["docker", "cp", str(parquet_path), f"{ch_container}:{tmp_path}"],
137
+ check=True,
138
+ )
139
+ docker_cmd = [
140
+ "docker", "exec", ch_container,
141
+ "clickhouse-client",
142
+ "--query", f"INSERT INTO {table_name} FROM INFILE '{tmp_path}' FORMAT Parquet",
143
+ ]
144
+ subprocess.run(docker_cmd, check=True)
145
+ subprocess.run(["docker", "exec", ch_container, "rm", "-f", tmp_path], check=True)
146
+ return True
147
+ except FileNotFoundError:
148
+ raise RuntimeError(
149
+ "clickhouse-client not found and docker is unavailable. Install clickhouse-client or use a ClickHouse container."
150
+ )
151
+ except Exception as e:
152
+ print(f" ❌ Failed to ingest {parquet_path.name}: {e}")
153
+ return False
154
+
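The `PAR1` sniff in `ingest_parquet` guards against ingesting a non-Parquet file, for example an HTML error page saved by a failed download. The check in isolation (file contents here are synthetic):

```python
from pathlib import Path
import tempfile

def looks_like_parquet(path: Path) -> bool:
    """Parquet files begin (and end) with the 4-byte magic b'PAR1';
    checking the header is a cheap sanity filter before ingestion."""
    with path.open("rb") as fh:
        return fh.read(4) == b"PAR1"

tmp = Path(tempfile.mkdtemp())
good = tmp / "good.parquet"
good.write_bytes(b"PAR1" + b"\x00" * 16 + b"PAR1")  # minimal fake framing
bad = tmp / "bad.parquet"
bad.write_bytes(b"<html>error</html>")
```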
155
+
156
+ def run_etl(epoch: int, dest_dir: Path, client, dry_run: bool = False, token: str | None = None, skip_neo4j: bool = False, skip_clickhouse: bool = False) -> None:
157
+ """
158
+     Full ETL pipeline:
+     1. Use local Parquet files (no download)
+     2. Ingest Parquet files into ClickHouse
+     3. Keep local files (no deletion)
+     4. Merge the epoch's Neo4j dump into the live database (if present)
162
+ """
163
+ if not dest_dir.exists():
164
+ raise FileNotFoundError(f"Epoch directory not found: {dest_dir}")
165
+
166
+ if not skip_clickhouse:
167
+ # Step 2: Ingest each Parquet file
168
+ print(f"\n📤 Ingesting Parquet files into ClickHouse...")
169
+ for stem, table_name in tqdm(PARQUET_TABLE_MAP.items(), desc="Ingesting"):
170
+ parquet_path = dest_dir / f"{stem}_epoch_{epoch}.parquet"
171
+ if not parquet_path.exists():
172
+ print(f" ⚠️ Skipping {stem}: file not found.")
173
+ continue
174
+
175
+ ingest_parquet(client, table_name, parquet_path, dry_run=dry_run)
176
+
177
+ print("\n✅ ClickHouse ingestion complete.")
178
+ else:
179
+ print("\nℹ️ ClickHouse ingestion skipped.")
180
+
181
+ # Step 4: Neo4j dump
182
+ neo4j_path = dest_dir / NEO4J_FILENAME.format(epoch=epoch)
183
+ if neo4j_path.exists() and not skip_neo4j:
184
+ merge_neo4j_epoch_dump(epoch, neo4j_path, dry_run=dry_run)
185
+ elif neo4j_path.exists() and skip_neo4j:
186
+ print(f"\nℹ️ Neo4j dump found but skipped: {neo4j_path}")
187
+
188
+ print("\n🎉 Full ETL pipeline complete.")
189
+
190
+
191
+ def ingest_neo4j_dump(dump_path: Path, database: str = "neo4j", dry_run: bool = False) -> bool:
192
+ """
193
+ Load a Neo4j dump file into the database.
194
+ Requires neo4j-admin CLI and the Neo4j service to be stopped.
195
+ Returns True on success.
196
+ """
197
+ import subprocess
198
+
199
+ if not dump_path.exists():
200
+ print(f" ⚠️ Neo4j dump not found: {dump_path}")
201
+ return False
202
+
203
+ import shutil
204
+
205
+ expected_dump_name = f"{database}.dump"
206
+ load_dir = dump_path.parent
207
+ temp_load_dir = None
208
+ if dump_path.name != expected_dump_name:
209
+ temp_load_dir = dump_path.parent / f"_neo4j_load_{database}"
210
+ temp_load_dir.mkdir(parents=True, exist_ok=True)
211
+ load_dump_path = temp_load_dir / expected_dump_name
212
+ shutil.copy2(dump_path, load_dump_path)
213
+ load_dir = temp_load_dir
214
+
215
+ # neo4j-admin database load requires a directory containing <database>.dump
216
+ # For Neo4j 5.x: neo4j-admin database load --from-path=<dir> <database>
217
+ # Note: User must clear the database before loading (no --overwrite flag)
218
+ cmd = [
219
+ "neo4j-admin", "database", "load",
220
+ f"--from-path={load_dir.resolve()}",
221
+ database,
222
+ ]
223
+
224
+ if dry_run:
225
+ print(f" [DRY-RUN] {' '.join(cmd)}")
226
+ return True
227
+
228
+ print(f"🔄 Loading Neo4j dump into database '{database}'...")
229
+ print(" ⚠️ Neo4j must be stopped for offline load.")
230
+
231
+     try:
+         subprocess.run(cmd, capture_output=True, text=True, check=True)
+         print(" ✅ Neo4j dump loaded successfully.")
+         if temp_load_dir:
+             shutil.rmtree(temp_load_dir, ignore_errors=True)
+         return True
235
+ except FileNotFoundError:
236
+ # Fall back to dockerized neo4j-admin if available
237
+ docker_container = NEO4J_DOCKER_CONTAINER
238
+ try:
239
+ docker_ps = subprocess.run(
240
+ ["docker", "ps", "-a", "--format", "{{.Names}}\t{{.Image}}"],
241
+ capture_output=True,
242
+ text=True,
243
+ check=True,
244
+ )
245
+ except FileNotFoundError:
246
+ print(" ❌ neo4j-admin not found and docker is unavailable.")
247
+ return False
248
+ except subprocess.CalledProcessError as e:
249
+ print(f" ❌ Failed to list docker containers: {e.stderr}")
250
+ return False
251
+
252
+ containers = [line.strip().split("\t") for line in docker_ps.stdout.splitlines() if line.strip()]
253
+ container_names = {name for name, _ in containers}
254
+ if docker_container not in container_names:
255
+ # Try to auto-detect a neo4j container if the default name isn't found.
256
+ neo4j_candidates = [name for name, image in containers if image.startswith("neo4j")]
257
+ if neo4j_candidates:
258
+ docker_container = neo4j_candidates[0]
259
+ print(f" ℹ️ Using detected Neo4j container '{docker_container}'.")
260
+ else:
261
+ print(f" ❌ neo4j-admin not found and docker container '{docker_container}' does not exist.")
262
+ return False
263
+
264
+ docker_running = subprocess.run(
265
+ ["docker", "ps", "--format", "{{.Names}}"],
266
+ capture_output=True,
267
+ text=True,
268
+ check=True,
269
+ )
270
+ running = set(line.strip() for line in docker_running.stdout.splitlines() if line.strip())
271
+ was_running = docker_container in running
272
+
273
+ if was_running:
274
+ print(f" 🛑 Stopping Neo4j container '{docker_container}' for offline load...")
275
+ if dry_run:
276
+ print(f" [DRY-RUN] docker stop {docker_container}")
277
+ else:
278
+ subprocess.run(["docker", "stop", docker_container], check=True)
279
+
280
+ dump_name = dump_path.name
281
+ docker_cmd = [
282
+ "docker", "run", "--rm",
283
+ "--volumes-from", docker_container,
284
+ "-v", f"{load_dir.resolve()}:/dump",
285
+ "neo4j:latest",
286
+ "neo4j-admin", "database", "load",
287
+ f"--from-path=/dump",
288
+ "--overwrite-destination",
289
+ database,
290
+ ]
291
+
292
+ if dry_run:
293
+ print(f" [DRY-RUN] {' '.join(docker_cmd)}")
294
+ else:
295
+ print(f" 🔄 Running neo4j-admin in docker for {dump_name}...")
296
+ subprocess.run(docker_cmd, check=True)
297
+ print(" ✅ Neo4j dump loaded successfully (docker).")
298
+
299
+ if was_running:
300
+ print(f" ▶️ Starting Neo4j container '{docker_container}'...")
301
+ if dry_run:
302
+ print(f" [DRY-RUN] docker start {docker_container}")
303
+ else:
304
+ subprocess.run(["docker", "start", docker_container], check=True)
305
+ _wait_for_bolt(NEO4J_URI)
306
+ if temp_load_dir and not dry_run:
307
+ shutil.rmtree(temp_load_dir, ignore_errors=True)
308
+ return True
309
+ except subprocess.CalledProcessError as e:
310
+ print(f" ❌ Failed to load Neo4j dump: {e.stderr}")
311
+ if temp_load_dir and not dry_run:
312
+ shutil.rmtree(temp_load_dir, ignore_errors=True)
313
+ return False
314
+
315
+
316
+ def _neo4j_driver():
317
+ from neo4j import GraphDatabase
318
+ if NEO4J_USER is None and NEO4J_PASSWORD is None:
319
+ return GraphDatabase.driver(NEO4J_URI, auth=None)
320
+ return GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
321
+
322
+
323
+ def _run_merge_batch(tx, query: str, rows: list[dict]) -> None:
324
+ tx.run(query, rows=rows)
325
+
326
+
327
+ def _stream_merge(temp_session, target_session, match_query: str, merge_query: str, label: str) -> None:
328
+ batch = []
329
+ result = temp_session.run(match_query, fetch_size=NEO4J_MERGE_BATCH_SIZE)
330
+ for record in result:
331
+ batch.append(record.data())
332
+ if len(batch) >= NEO4J_MERGE_BATCH_SIZE:
333
+ target_session.execute_write(_run_merge_batch, merge_query, batch)
334
+ batch.clear()
335
+ if batch:
336
+ target_session.execute_write(_run_merge_batch, merge_query, batch)
337
+
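`_stream_merge` flushes a write transaction every `NEO4J_MERGE_BATCH_SIZE` records, plus once more for the remainder. The same batching shape, with a plain list sink standing in for the Neo4j write (illustrative only):

```python
def flush_in_batches(records, batch_size, sink):
    """Accumulate records and call sink(batch) whenever batch_size is
    reached, then flush any remainder at the end."""
    batch = []
    for rec in records:
        batch.append(rec)
        if len(batch) >= batch_size:
            sink(list(batch))
            batch.clear()
    if batch:
        sink(list(batch))

flushed = []
flush_in_batches(range(5), 2, flushed.append)
```

Batching like this bounds the parameter size of each `UNWIND $rows` transaction, which keeps memory use and transaction duration predictable on large dumps.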
338
+
339
+ def _wait_for_bolt(uri: str, timeout_sec: int = 60) -> None:
340
+ from neo4j import GraphDatabase
341
+ start = time.time()
342
+ while True:
343
+ try:
344
+ temp_driver = GraphDatabase.driver(uri, auth=None)
345
+ with temp_driver.session(database="neo4j") as session:
346
+ session.run("RETURN 1").consume()
347
+ temp_driver.close()
348
+ return
349
+ except Exception:
350
+ if time.time() - start > timeout_sec:
351
+ raise RuntimeError(f"Timed out waiting for Neo4j at {uri}")
352
+ time.sleep(1)
353
+
354
+
355
+ def _start_temp_neo4j_from_dump(epoch: int, dump_path: Path) -> tuple[str, str, str, Path]:
356
+ import subprocess
357
+ import shutil
358
+
359
+ expected_dump_name = "neo4j.dump"
360
+ temp_load_dir = dump_path.parent / f"_neo4j_load_{epoch}"
361
+ temp_load_dir.mkdir(parents=True, exist_ok=True)
362
+ load_dump_path = temp_load_dir / expected_dump_name
363
+ shutil.copy2(dump_path, load_dump_path)
364
+
365
+ volume_name = f"neo4j_tmp_{epoch}"
366
+ subprocess.run(["docker", "volume", "create", volume_name], check=True)
367
+
368
+ subprocess.run(
369
+ [
370
+ "docker", "run", "--rm",
371
+ "-v", f"{volume_name}:/data",
372
+ "-v", f"{temp_load_dir.resolve()}:/dump",
373
+ "neo4j:latest",
374
+ "neo4j-admin", "database", "load",
375
+ "--from-path=/dump",
376
+ "--overwrite-destination",
377
+ "neo4j",
378
+ ],
379
+ check=True,
380
+ )
381
+
382
+ container_id = subprocess.check_output(
383
+ [
384
+ "docker", "run", "-d", "--rm",
385
+ "-e", "NEO4J_AUTH=none",
386
+ "-v", f"{volume_name}:/data",
387
+ "-p", "0:7687",
388
+ "neo4j:latest",
389
+ ],
390
+ text=True,
391
+ ).strip()
392
+
393
+ port_out = subprocess.check_output(
394
+ ["docker", "port", container_id, "7687/tcp"],
395
+ text=True,
396
+ ).strip()
397
+ host_port = port_out.split(":")[-1]
398
+ bolt_uri = f"bolt://localhost:{host_port}"
399
+ return container_id, bolt_uri, volume_name, temp_load_dir
400
+
401
+
402
+ def merge_neo4j_epoch_dump(epoch: int, dump_path: Path, dry_run: bool = False) -> None:
403
+ print(f"\n🧩 Merging Neo4j dump into '{NEO4J_TARGET_DB}' via temp container...")
404
+     if dry_run:
+         print(" [DRY-RUN] temp Neo4j container start and merge skipped.")
+         return
408
+
409
+ temp_container_id = None
410
+ temp_volume = None
411
+ temp_load_dir = None
412
+ temp_driver = None
413
+ temp_db_name = "neo4j"
414
+
415
+ temp_container_id, temp_bolt_uri, temp_volume, temp_load_dir = _start_temp_neo4j_from_dump(epoch, dump_path)
416
+ _wait_for_bolt(temp_bolt_uri)
417
+ from neo4j import GraphDatabase
418
+ temp_driver = GraphDatabase.driver(temp_bolt_uri, auth=None)
419
+
420
+ _wait_for_bolt(NEO4J_URI)
421
+ driver = _neo4j_driver()
422
+ try:
423
+ with temp_driver.session(database=temp_db_name) as temp_session, driver.session(database=NEO4J_TARGET_DB) as target_session:
424
+ # Wallet nodes
425
+ _stream_merge(
426
+ temp_session,
427
+ target_session,
428
+ "MATCH (w:Wallet) RETURN w.address AS address",
429
+ "UNWIND $rows AS t MERGE (w:Wallet {address: t.address})",
430
+ "wallets",
431
+ )
432
+
433
+ # Token nodes
434
+ _stream_merge(
435
+ temp_session,
436
+ target_session,
437
+ "MATCH (t:Token) RETURN t.address AS address, t.created_ts AS created_ts",
438
+ "UNWIND $rows AS t MERGE (k:Token {address: t.address}) "
439
+ "ON CREATE SET k.created_ts = t.created_ts "
440
+ "ON MATCH SET k.created_ts = CASE WHEN k.created_ts IS NULL OR "
441
+ "t.created_ts < k.created_ts THEN t.created_ts ELSE k.created_ts END",
442
+ "tokens",
443
+ )
444
+
445
+ # BUNDLE_TRADE
446
+ _stream_merge(
447
+ temp_session,
448
+ target_session,
449
+ "MATCH (a:Wallet)-[r:BUNDLE_TRADE]->(b:Wallet) "
450
+ "RETURN a.address AS wa, b.address AS wb, r.mint AS mint, r.slot AS slot, "
451
+ "r.timestamp AS timestamp, r.signatures AS signatures",
452
+ "UNWIND $rows AS t "
453
+ "MERGE (a:Wallet {address: t.wa}) "
454
+ "MERGE (b:Wallet {address: t.wb}) "
455
+ "MERGE (a)-[r:BUNDLE_TRADE {mint: t.mint, slot: t.slot}]->(b) "
456
+ "ON CREATE SET r.timestamp = t.timestamp, r.signatures = t.signatures "
457
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
458
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
459
+ "bundle_trade",
460
+ )
461
+
462
+ # TRANSFERRED_TO
463
+ _stream_merge(
464
+ temp_session,
465
+ target_session,
466
+ "MATCH (s:Wallet)-[r:TRANSFERRED_TO]->(d:Wallet) "
467
+ "RETURN s.address AS source, d.address AS destination, r.mint AS mint, "
468
+ "r.signature AS signature, r.timestamp AS timestamp, r.amount AS amount",
469
+ "UNWIND $rows AS t "
470
+ "MERGE (s:Wallet {address: t.source}) "
471
+ "MERGE (d:Wallet {address: t.destination}) "
472
+ "MERGE (s)-[r:TRANSFERRED_TO {mint: t.mint}]->(d) "
473
+ "ON CREATE SET r.signature = t.signature, r.timestamp = t.timestamp, r.amount = t.amount "
474
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
475
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
476
+ "transfer",
477
+ )
478
+
479
+ # COORDINATED_ACTIVITY
480
+ _stream_merge(
481
+ temp_session,
482
+ target_session,
483
+ "MATCH (f:Wallet)-[r:COORDINATED_ACTIVITY]->(l:Wallet) "
484
+ "RETURN f.address AS follower, l.address AS leader, r.mint AS mint, r.timestamp AS timestamp, "
485
+ "r.leader_first_sig AS leader_first_sig, r.leader_second_sig AS leader_second_sig, "
486
+ "r.follower_first_sig AS follower_first_sig, r.follower_second_sig AS follower_second_sig, "
487
+ "r.time_gap_on_first_sec AS gap_1, r.time_gap_on_second_sec AS gap_2",
488
+ "UNWIND $rows AS t "
489
+ "MERGE (l:Wallet {address: t.leader}) "
490
+ "MERGE (f:Wallet {address: t.follower}) "
491
+ "MERGE (f)-[r:COORDINATED_ACTIVITY {mint: t.mint}]->(l) "
492
+ "ON CREATE SET r.timestamp = t.timestamp, r.leader_first_sig = t.leader_first_sig, "
493
+ "r.leader_second_sig = t.leader_second_sig, r.follower_first_sig = t.follower_first_sig, "
494
+ "r.follower_second_sig = t.follower_second_sig, r.time_gap_on_first_sec = t.gap_1, "
495
+ "r.time_gap_on_second_sec = t.gap_2 "
496
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
497
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
498
+ "coordinated_activity",
499
+ )
500
+
501
+ # COPIED_TRADE
502
+ _stream_merge(
503
+ temp_session,
504
+ target_session,
505
+ "MATCH (f:Wallet)-[r:COPIED_TRADE]->(l:Wallet) "
506
+ "RETURN f.address AS follower, l.address AS leader, r.mint AS mint, r.timestamp AS timestamp, "
507
+ "r.buy_gap AS buy_gap, r.sell_gap AS sell_gap, r.leader_pnl AS leader_pnl, "
508
+ "r.follower_pnl AS follower_pnl, r.l_buy_sig AS l_buy_sig, r.l_sell_sig AS l_sell_sig, "
509
+ "r.f_buy_sig AS f_buy_sig, r.f_sell_sig AS f_sell_sig, r.l_buy_total AS l_buy_total, "
510
+ "r.l_sell_total AS l_sell_total, r.f_buy_total AS f_buy_total, r.f_sell_total AS f_sell_total, "
511
+ "r.f_buy_slip AS f_buy_slip, r.f_sell_slip AS f_sell_slip",
512
+ "UNWIND $rows AS t "
513
+ "MERGE (f:Wallet {address: t.follower}) "
514
+ "MERGE (l:Wallet {address: t.leader}) "
515
+ "MERGE (f)-[r:COPIED_TRADE {mint: t.mint}]->(l) "
516
+ "ON CREATE SET r.timestamp = t.timestamp, r.follower = t.follower, r.leader = t.leader, "
517
+ "r.mint = t.mint, r.buy_gap = t.buy_gap, r.sell_gap = t.sell_gap, r.leader_pnl = t.leader_pnl, "
518
+ "r.follower_pnl = t.follower_pnl, r.l_buy_sig = t.l_buy_sig, r.l_sell_sig = t.l_sell_sig, "
519
+ "r.f_buy_sig = t.f_buy_sig, r.f_sell_sig = t.f_sell_sig, r.l_buy_total = t.l_buy_total, "
520
+ "r.l_sell_total = t.l_sell_total, r.f_buy_total = t.f_buy_total, r.f_sell_total = t.f_sell_total, "
521
+ "r.f_buy_slip = t.f_buy_slip, r.f_sell_slip = t.f_sell_slip "
522
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
523
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
524
+ "copied_trade",
525
+ )
526
+
527
+ # MINTED
528
+ _stream_merge(
529
+ temp_session,
530
+ target_session,
531
+ "MATCH (c:Wallet)-[r:MINTED]->(k:Token) "
532
+ "RETURN c.address AS creator, k.address AS token, r.signature AS signature, "
533
+ "r.timestamp AS timestamp, r.buy_amount AS buy_amount",
534
+ "UNWIND $rows AS t "
535
+ "MERGE (c:Wallet {address: t.creator}) "
536
+ "MERGE (k:Token {address: t.token}) "
537
+ "MERGE (c)-[r:MINTED {signature: t.signature}]->(k) "
538
+ "ON CREATE SET r.timestamp = t.timestamp, r.buy_amount = t.buy_amount "
539
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
540
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
541
+ "minted",
542
+ )
543
+
544
+ # SNIPED
545
+ _stream_merge(
546
+ temp_session,
547
+ target_session,
548
+ "MATCH (w:Wallet)-[r:SNIPED]->(k:Token) "
549
+ "RETURN w.address AS wallet, k.address AS token, r.signature AS signature, "
550
+ "r.rank AS rank, r.sniped_amount AS sniped_amount, r.timestamp AS timestamp",
551
+ "UNWIND $rows AS t "
552
+ "MERGE (w:Wallet {address: t.wallet}) "
553
+ "MERGE (k:Token {address: t.token}) "
554
+ "MERGE (w)-[r:SNIPED {signature: t.signature}]->(k) "
555
+ "ON CREATE SET r.rank = t.rank, r.sniped_amount = t.sniped_amount, r.timestamp = t.timestamp "
556
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
557
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
558
+ "sniped",
559
+ )
560
+
561
+ # LOCKED_SUPPLY
562
+ _stream_merge(
563
+ temp_session,
564
+ target_session,
565
+ "MATCH (s:Wallet)-[r:LOCKED_SUPPLY]->(k:Token) "
566
+ "RETURN s.address AS sender, k.address AS mint, r.signature AS signature, "
567
+ "r.amount AS amount, r.unlock_timestamp AS unlock_ts, r.recipient AS recipient, "
568
+ "r.timestamp AS timestamp",
569
+ "UNWIND $rows AS t "
570
+ "MERGE (s:Wallet {address: t.sender}) "
571
+ "MERGE (k:Token {address: t.mint}) "
572
+ "MERGE (s)-[r:LOCKED_SUPPLY {signature: t.signature}]->(k) "
573
+ "ON CREATE SET r.amount = t.amount, r.unlock_timestamp = t.unlock_ts, "
574
+ "r.recipient = t.recipient, r.timestamp = t.timestamp "
575
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
576
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
577
+ "locked_supply",
578
+ )
579
+
580
+ # BURNED
581
+ _stream_merge(
582
+ temp_session,
583
+ target_session,
584
+ "MATCH (w:Wallet)-[r:BURNED]->(k:Token) "
585
+ "RETURN w.address AS wallet, k.address AS token, r.signature AS signature, "
586
+ "r.amount AS amount, r.timestamp AS timestamp",
587
+ "UNWIND $rows AS t "
588
+ "MERGE (w:Wallet {address: t.wallet}) "
589
+ "MERGE (k:Token {address: t.token}) "
590
+ "MERGE (w)-[r:BURNED {signature: t.signature}]->(k) "
591
+ "ON CREATE SET r.amount = t.amount, r.timestamp = t.timestamp "
592
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
593
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
594
+ "burned",
595
+ )
596
+
597
+ # PROVIDED_LIQUIDITY
598
+ _stream_merge(
599
+ temp_session,
600
+ target_session,
601
+ "MATCH (w:Wallet)-[r:PROVIDED_LIQUIDITY]->(k:Token) "
602
+ "RETURN w.address AS wallet, k.address AS token, r.signature AS signature, "
603
+ "r.pool_address AS pool_address, r.amount_base AS amount_base, "
604
+ "r.amount_quote AS amount_quote, r.timestamp AS timestamp",
605
+ "UNWIND $rows AS t "
606
+ "MERGE (w:Wallet {address: t.wallet}) "
607
+ "MERGE (k:Token {address: t.token}) "
608
+ "MERGE (w)-[r:PROVIDED_LIQUIDITY {signature: t.signature}]->(k) "
609
+ "ON CREATE SET r.pool_address = t.pool_address, r.amount_base = t.amount_base, "
610
+ "r.amount_quote = t.amount_quote, r.timestamp = t.timestamp "
611
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
612
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
613
+ "provided_liquidity",
614
+ )
615
+
616
+ # TOP_TRADER_OF
617
+ _stream_merge(
618
+ temp_session,
619
+ target_session,
620
+ "MATCH (w:Wallet)-[r:TOP_TRADER_OF]->(k:Token) "
621
+ "RETURN w.address AS wallet, k.address AS token, r.pnl_at_creation AS pnl_at_creation, "
622
+ "r.ath_usd_at_creation AS ath_at_creation, r.timestamp AS timestamp",
623
+ "UNWIND $rows AS t "
624
+ "MERGE (w:Wallet {address: t.wallet}) "
625
+ "MERGE (k:Token {address: t.token}) "
626
+ "MERGE (w)-[r:TOP_TRADER_OF]->(k) "
627
+ "ON CREATE SET r.pnl_at_creation = t.pnl_at_creation, r.ath_usd_at_creation = t.ath_at_creation, "
628
+ "r.timestamp = t.timestamp "
629
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
630
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
631
+ "top_trader_of",
632
+ )
633
+
634
+ # WHALE_OF
635
+ _stream_merge(
636
+ temp_session,
637
+ target_session,
638
+ "MATCH (w:Wallet)-[r:WHALE_OF]->(k:Token) "
639
+ "RETURN w.address AS wallet, k.address AS token, r.holding_pct_at_creation AS pct_at_creation, "
640
+ "r.ath_usd_at_creation AS ath_at_creation, r.timestamp AS timestamp",
641
+ "UNWIND $rows AS t "
642
+ "MERGE (w:Wallet {address: t.wallet}) "
643
+ "MERGE (k:Token {address: t.token}) "
644
+ "MERGE (w)-[r:WHALE_OF]->(k) "
645
+ "ON CREATE SET r.holding_pct_at_creation = t.pct_at_creation, "
646
+ "r.ath_usd_at_creation = t.ath_at_creation, r.timestamp = t.timestamp "
647
+ "ON MATCH SET r.timestamp = CASE WHEN r.timestamp IS NULL OR "
648
+ "t.timestamp < r.timestamp THEN t.timestamp ELSE r.timestamp END",
649
+ "whale_of",
650
+ )
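Every relationship merge above uses the same `ON CREATE` / `ON MATCH` idiom: store the row's timestamp on first sight, and on later sights keep the earliest value. A torch/Neo4j-free Python sketch of that upsert rule (the `store` dict and wallet/mint keys are hypothetical stand-ins for the graph):

```python
# Sketch of the ON CREATE / ON MATCH timestamp rule used by every merge above:
# on first sight store the row's timestamp, afterwards keep the minimum seen.
def upsert_earliest(store: dict, key: tuple, timestamp: int) -> None:
    existing = store.get(key)
    if existing is None or timestamp < existing:
        store[key] = timestamp

store = {}
upsert_earliest(store, ("walletA", "walletB", "mint1"), 1700000300)
upsert_earliest(store, ("walletA", "walletB", "mint1"), 1700000100)  # earlier wins
upsert_earliest(store, ("walletA", "walletB", "mint1"), 1700000500)  # later ignored
```

This is why re-running the ETL for an already-loaded epoch is idempotent: repeated rows can only move a relationship's timestamp backwards, never forwards.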
651
+ finally:
652
+ driver.close()
653
+
654
+ try:
655
+ if temp_driver:
656
+ temp_driver.close()
657
+ if temp_container_id:
658
+ import subprocess
659
+ subprocess.run(["docker", "stop", temp_container_id], check=True)
660
+ if temp_volume:
661
+ import subprocess
662
+ subprocess.run(["docker", "volume", "rm", "-f", temp_volume], check=True)
663
+ if temp_load_dir:
664
+ import shutil
665
+ shutil.rmtree(temp_load_dir, ignore_errors=True)
666
+ print(" 🧹 Dropped temp Neo4j container.")
667
+ except Exception as e:
668
+ print(f" ⚠️ Failed to clean temp Neo4j container: {e}")
669
+
670
+
671
+ def parse_args() -> argparse.Namespace:
672
+ parser = argparse.ArgumentParser(description="ETL: Download, Ingest, Delete epoch Parquet files.")
673
+ parser.add_argument("--epoch", type=int, required=True, help="Epoch number to process (e.g., 851)")
674
+ parser.add_argument("-c", "--skip-clickhouse", action="store_true", help="Skip ClickHouse ingestion")
675
+ parser.add_argument("--dry-run", action="store_true", help="Print queries without executing")
676
+ parser.add_argument("--skip-neo4j", action="store_true", help="Skip Neo4j dump loading")
677
+ parser.add_argument("--token", type=str, default=None, help="Hugging Face token (or set HF_TOKEN env var)")
678
+ return parser.parse_args()
679
+
680
+
681
+ def main() -> None:
682
+ args = parse_args()
683
+ token = args.token or os.environ.get("HF_TOKEN")
684
+
685
+ dest_dir = Path(DEFAULT_DEST_DIR).expanduser() / f"epoch_{args.epoch}"
686
+
687
+ # Connect to ClickHouse
688
+ print(f"🔌 Connecting to ClickHouse at {CH_HOST}:{CH_PORT}...")
689
+ try:
690
+ client = clickhouse_connect.get_client(
691
+ host=CH_HOST,
692
+ port=CH_PORT,
693
+ username=CH_USER,
694
+ password=CH_PASSWORD,
695
+ database=CH_DATABASE,
696
+ )
697
+ except Exception as e:
698
+ print(f"❌ Failed to connect to ClickHouse: {e}")
699
+ sys.exit(1)
700
+
701
+ run_etl(
702
+ epoch=args.epoch,
703
+ dest_dir=dest_dir,
704
+ client=client,
705
+ dry_run=args.dry_run,
706
+ token=token,
707
+ skip_neo4j=args.skip_neo4j,
708
+ skip_clickhouse=args.skip_clickhouse,
709
+ )
710
+
711
+
712
+ if __name__ == "__main__":
713
+ main()
train.py ADDED
@@ -0,0 +1,465 @@
1
+ import os
2
+ import argparse
3
+ import math
4
+ import logging
5
+ from pathlib import Path
6
+ from typing import Any, Dict, List, Optional, Tuple
7
+
8
+ # Ensure torch/dill have a writable tmp dir
9
+ _DEFAULT_TMP = Path(os.getenv("TMPDIR_OVERRIDE", "./.tmp"))
10
+ _DEFAULT_TMP.mkdir(parents=True, exist_ok=True)
11
+ resolved_tmp = str(_DEFAULT_TMP.resolve())
12
+ for key in ("TMPDIR", "TMP", "TEMP"):
13
+ os.environ.setdefault(key, resolved_tmp)
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ from torch.utils.data import DataLoader
18
+ from torch.optim import AdamW
19
+
20
+ # --- Accelerate & Transformers ---
21
+ from accelerate import Accelerator
22
+ from accelerate.logging import get_logger
23
+ from accelerate.utils import ProjectConfiguration, set_seed
24
+ from transformers import get_linear_schedule_with_warmup
25
+
26
+ # Logging
27
+ from tqdm.auto import tqdm
28
+
29
+ # DB Clients
30
+ from clickhouse_driver import Client as ClickHouseClient
31
+ from neo4j import GraphDatabase
32
+
33
+ # Local Imports
34
+ from data.data_fetcher import DataFetcher
35
+ from data.data_loader import OracleDataset
36
+ from data.data_collator import MemecoinCollator
37
+ from models.multi_modal_processor import MultiModalEncoder
38
+ from models.helper_encoders import ContextualTimeEncoder
39
+ from models.token_encoder import TokenEncoder
40
+ from models.wallet_encoder import WalletEncoder
41
+ from models.graph_updater import GraphUpdater
42
+ from models.ohlc_embedder import OHLCEmbedder
43
+ from models.model import Oracle
44
+ import models.vocabulary as vocab
45
+
46
+ # Setup Logger
47
+ logger = get_logger(__name__)
48
+
49
+
50
+ def compute_gradient_stats(model: nn.Module) -> Tuple[Optional[Dict[str, float]], Dict[str, float]]:
51
+ """Return overall and per-module gradient statistics for logging."""
52
+ grad_norms: List[float] = []
53
+ max_abs = 0.0
54
+ module_l2_sums: Dict[str, float] = {}
55
+
56
+ for name, param in model.named_parameters():
57
+ if param.grad is None:
58
+ continue
59
+ grad = param.grad.detach()
60
+ grad_norm = grad.norm().item()
61
+ grad_norms.append(grad_norm)
62
+ max_abs = max(max_abs, grad.abs().max().item())
63
+
64
+ module_name = name.split(".", 1)[0]
65
+ grad_fp32 = grad.float()
66
+ module_l2_sums[module_name] = module_l2_sums.get(module_name, 0.0) + float(grad_fp32.pow(2).sum().item())
67
+
68
+ if not grad_norms:
69
+ return None, {}
70
+
71
+ module_grad_norms = {module: math.sqrt(total) for module, total in module_l2_sums.items()}
72
+
73
+ return {
74
+ "grad_norm_mean": float(sum(grad_norms) / len(grad_norms)),
75
+ "grad_norm_max": float(max(grad_norms)),
76
+ "grad_abs_max": float(max_abs),
77
+ }, module_grad_norms
78
+
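The per-module norms returned by `compute_gradient_stats` are just the L2 norm of all gradients sharing a top-level module prefix. A torch-free sketch of that aggregation, with hypothetical parameter names standing in for `model.named_parameters()`:

```python
import math

# Hypothetical (name, grad_values) pairs in place of model.named_parameters().
grads = [
    ("encoder.layer0.weight", [3.0, 4.0]),   # L2 norm 5.0
    ("encoder.layer1.bias",   [0.0]),
    ("head.weight",           [6.0, 8.0]),   # L2 norm 10.0
]

module_l2_sums = {}
for name, values in grads:
    module = name.split(".", 1)[0]  # top-level prefix, as in compute_gradient_stats
    module_l2_sums[module] = module_l2_sums.get(module, 0.0) + sum(v * v for v in values)

module_grad_norms = {m: math.sqrt(total) for m, total in module_l2_sums.items()}
```

Summing squared norms per module and taking one square root at the end is equivalent to concatenating all of a module's gradients into a single vector and taking its norm.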
79
+ def quantile_pinball_loss(preds: torch.Tensor,
80
+ targets: torch.Tensor,
81
+ mask: torch.Tensor,
82
+ quantiles: List[float]) -> torch.Tensor:
83
+ """
84
+ Calculates Pinball Loss for quantile regression.
85
+ """
86
+ if mask.sum() == 0:
87
+ return torch.tensor(0.0, device=preds.device, dtype=preds.dtype)
88
+
89
+ num_quantiles = len(quantiles)
90
+ losses = []
91
+ for idx, q in enumerate(quantiles):
92
+ # Preds shape: [B, Horizons * Quantiles]
93
+ # Logic assumes interleaved outputs or consistent flattening.
94
+ pred_slice = preds[:, idx::num_quantiles]
95
+ target_slice = targets[:, idx::num_quantiles]
96
+ mask_slice = mask[:, idx::num_quantiles]
97
+
98
+ diff = target_slice - pred_slice
99
+ pinball = torch.maximum((q - 1.0) * diff, q * diff)
100
+ losses.append((pinball * mask_slice).sum())
101
+
102
+ return sum(losses) / mask.sum().clamp_min(1.0)
103
+
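Pinball loss penalizes under-prediction with weight `q` and over-prediction with weight `1 - q`, which is what makes the minimizer the q-th quantile. A torch-free numeric sketch of the same per-element formula used above:

```python
def pinball(pred: float, target: float, q: float) -> float:
    # max((q - 1) * diff, q * diff) — same expression as the torch.maximum above
    diff = target - pred
    return max((q - 1.0) * diff, q * diff)

# Target above prediction: loss scales with q.
assert pinball(pred=0.0, target=1.0, q=0.9) == 0.9
# Target below prediction by the same amount: loss scales with 1 - q.
assert abs(pinball(pred=1.0, target=0.0, q=0.9) - 0.1) < 1e-9
```

For q = 0.9 the model is punished nine times harder for predicting below the target than above it, pushing predictions toward the 90th percentile.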
104
+
105
+ def filtered_collate(collator: MemecoinCollator,
106
+ batch: List[Optional[Dict[str, Any]]]) -> Optional[Dict[str, Any]]:
107
+ """Filter out None items from the dataset before collating."""
108
+ batch = [item for item in batch if item is not None]
109
+ if not batch:
110
+ return None
111
+ return collator(batch)
112
+
113
+
114
+ def parse_args() -> argparse.Namespace:
115
+ parser = argparse.ArgumentParser(description="Train the Oracle quantile model.")
116
+ parser.add_argument("--epochs", type=int, default=1)
117
+ parser.add_argument("--batch_size", type=int, default=1)
118
+ parser.add_argument("--learning_rate", type=float, default=5e-5)
119
+ parser.add_argument("--warmup_ratio", type=float, default=0.1)
120
+ parser.add_argument("--grad_accum_steps", type=int, default=1)
121
+ parser.add_argument("--max_grad_norm", type=float, default=1.0)
122
+ parser.add_argument("--seed", type=int, default=42)
123
+ parser.add_argument("--log_every", type=int, default=1)
124
+ parser.add_argument("--save_every", type=int, default=1000)
125
+ parser.add_argument("--tensorboard_dir", type=str, default="runs/oracle")
126
+ parser.add_argument("--checkpoint_dir", type=str, default="checkpoints")
127
+ parser.add_argument("--mixed_precision", type=str, default="bf16")
128
+ parser.add_argument("--max_seq_len", type=int, default=16000)
129
+ parser.add_argument("--ohlc_seq_len", type=int, default=60)
130
+ parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
131
+ parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
132
+ parser.add_argument("--max_samples", type=int, default=None)
133
+ parser.add_argument("--ohlc_stats_path", type=str, default="./data/ohlc_stats.npz")
134
+ parser.add_argument("--t_cutoff_seconds", type=int, default=60)
135
+ parser.add_argument("--shuffle", dest="shuffle", action="store_true", default=True)
136
+ parser.add_argument("--no-shuffle", dest="shuffle", action="store_false")
137
+ parser.add_argument("--num_workers", type=int, default=0)
138
+ parser.add_argument("--pin_memory", dest="pin_memory", action="store_true", default=False)
139
+ parser.add_argument("--no-pin_memory", dest="pin_memory", action="store_false")
140
+ parser.add_argument("--clickhouse_host", type=str, default="localhost")
141
+ parser.add_argument("--clickhouse_port", type=int, default=9000)
142
+ parser.add_argument("--neo4j_uri", type=str, default="bolt://localhost:7687")
143
+ parser.add_argument("--neo4j_user", type=str, default=None)
144
+ parser.add_argument("--neo4j_password", type=str, default=None)
145
+ return parser.parse_args()
146
+
147
+
148
+ def main() -> None:
149
+ args = parse_args()
150
+ epochs = args.epochs
151
+ batch_size = args.batch_size
152
+ learning_rate = args.learning_rate
153
+ warmup_ratio = args.warmup_ratio
154
+ grad_accum_steps = args.grad_accum_steps
155
+ max_grad_norm = args.max_grad_norm
156
+ seed = args.seed
157
+
158
+ log_every = args.log_every
159
+ save_every = args.save_every
160
+
161
+ tensorboard_dir = Path(args.tensorboard_dir).expanduser()
162
+ checkpoint_dir = Path(args.checkpoint_dir).expanduser()
163
+
164
+ # --- 1. Initialize Accelerator ---
165
+ project_config = ProjectConfiguration(project_dir=str(checkpoint_dir), logging_dir=str(tensorboard_dir))
166
+ accelerator = Accelerator(
167
+ gradient_accumulation_steps=grad_accum_steps,
168
+ log_with="tensorboard",
169
+ project_config=project_config,
170
+ mixed_precision=args.mixed_precision # Default to bf16 for stability
171
+ )
172
+
173
+ # Make one log on every process with the configuration for debugging.
174
+ logging.basicConfig(
175
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
176
+ datefmt="%m/%d/%Y %H:%M:%S",
177
+ level=logging.INFO,
178
+ )
179
+ logger.info(accelerator.state, main_process_only=False)
180
+
181
+ # Set seed for reproducibility
182
+ set_seed(seed)
183
+
184
+ if accelerator.is_main_process:
185
+ logger.info("Initialized with CLI arguments.")
186
+ tensorboard_dir.mkdir(parents=True, exist_ok=True)
187
+ checkpoint_dir.mkdir(parents=True, exist_ok=True)
188
+ accelerator.init_trackers("oracle_training")
189
+
190
+ device = accelerator.device
191
+
192
+ # Determine dtype for model initialization
193
+ init_dtype = torch.float32
194
+ if accelerator.mixed_precision == 'bf16':
195
+ init_dtype = torch.bfloat16
196
+ elif accelerator.mixed_precision == 'fp16':
197
+ init_dtype = torch.float16
198
+
199
+ # --- 2. Data Setup ---
200
+ horizons = args.horizons_seconds
201
+ quantiles = args.quantiles
202
+ max_seq_len = args.max_seq_len
203
+ ohlc_seq_len = args.ohlc_seq_len
204
+
205
+ logger.info(f"Initializing Encoders with dtype={init_dtype}...")
206
+
207
+ # Encoders
208
+ multi_modal_encoder = MultiModalEncoder(dtype=init_dtype)
209
+ time_encoder = ContextualTimeEncoder(dtype=init_dtype)
210
+ token_encoder = TokenEncoder(multi_dim=multi_modal_encoder.embedding_dim, dtype=init_dtype)
211
+ wallet_encoder = WalletEncoder(encoder=multi_modal_encoder, dtype=init_dtype)
212
+ graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=init_dtype)
213
+ ohlc_embedder = OHLCEmbedder(
214
+ num_intervals=vocab.NUM_OHLC_INTERVALS,
215
+ sequence_length=ohlc_seq_len,
216
+ dtype=init_dtype
217
+ )
218
+
219
+ collator = MemecoinCollator(
220
+ event_type_to_id=vocab.EVENT_TO_ID,
221
+ device=device, # Note: Collator will handle basic moves, Accelerate handles the rest
222
+ multi_modal_encoder=multi_modal_encoder,
223
+ dtype=init_dtype,
224
+ ohlc_seq_len=ohlc_seq_len,
225
+ max_seq_len=max_seq_len
226
+ )
227
+
228
+ # DB Connections
229
+ clickhouse_client = ClickHouseClient(
230
+ host=args.clickhouse_host,
231
+ port=int(args.clickhouse_port)
232
+ )
233
+
234
+ neo4j_auth = None
235
+ if args.neo4j_user is not None:
236
+ neo4j_auth = (args.neo4j_user, args.neo4j_password or "")
237
+ neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=neo4j_auth)
238
+
239
+ data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
240
+
241
+ dataset = OracleDataset(
242
+ data_fetcher=data_fetcher,
243
+ horizons_seconds=horizons,
244
+ quantiles=quantiles,
245
+ max_samples=args.max_samples,
246
+ ohlc_stats_path=args.ohlc_stats_path,
247
+ t_cutoff_seconds=int(args.t_cutoff_seconds)
248
+ )
249
+
250
+ if len(dataset) == 0:
251
+ raise RuntimeError("Dataset is empty.")
252
+
253
+ dataloader = DataLoader(
254
+ dataset,
255
+ batch_size=batch_size,
256
+ shuffle=bool(args.shuffle),
257
+ num_workers=int(args.num_workers),
258
+ pin_memory=bool(args.pin_memory),
259
+ collate_fn=lambda batch: filtered_collate(collator, batch)
260
+ )
261
+
262
+ # --- 3. Model Init ---
263
+ logger.info("Initializing Oracle Model...")
264
+ model = Oracle(
265
+ token_encoder=token_encoder,
266
+ wallet_encoder=wallet_encoder,
267
+ graph_updater=graph_updater,
268
+ ohlc_embedder=ohlc_embedder,
269
+ time_encoder=time_encoder,
270
+ num_event_types=vocab.NUM_EVENT_TYPES,
271
+ multi_modal_dim=multi_modal_encoder.embedding_dim,
272
+ event_pad_id=vocab.EVENT_TO_ID["__PAD__"],
273
+ event_type_to_id=vocab.EVENT_TO_ID,
274
+ model_config_name="Qwen/Qwen3-0.6B",
275
+ quantiles=quantiles,
276
+ horizons_seconds=horizons,
277
+ dtype=init_dtype
278
+ )
279
+
280
+ # Memory Optimization: Delete unused embedding layer from Qwen backbone
281
+ if hasattr(model.model, 'embed_tokens'):
282
+ del model.model.embed_tokens
283
+ logger.info("Freed unused Qwen embedding layer memory.")
284
+
285
+ # --- 4. Optimizer & Scheduler ---
286
+ optimizer = AdamW(model.parameters(), lr=learning_rate)
287
+
288
+ # Calculate training steps
289
+ num_update_steps_per_epoch = math.ceil(len(dataloader) / grad_accum_steps)
290
+ max_train_steps = epochs * num_update_steps_per_epoch
291
+ num_warmup_steps = int(max_train_steps * warmup_ratio)
292
+
293
+ scheduler = get_linear_schedule_with_warmup(
294
+ optimizer,
295
+ num_warmup_steps=num_warmup_steps,
296
+ num_training_steps=max_train_steps
297
+ )
298
+
299
+ # --- 5. Accelerate Prepare ---
300
+ model, optimizer, dataloader, scheduler = accelerator.prepare(
301
+ model, optimizer, dataloader, scheduler
302
+ )
303
+
304
+ # --- 6. Resume Training Logic ---
305
+ # Load checkpoint if it exists
306
+ starting_epoch = 0
307
+ resume_step = 0
308
+
309
+ # Check for existing checkpoints
310
+ if checkpoint_dir.exists():
311
+ # Look for subfolders named 'checkpoint-X' or 'epoch_X'
312
+ # Accelerate saves to folders.
313
+ dirs = [d for d in checkpoint_dir.iterdir() if d.is_dir()]
314
+ if dirs:
315
+ # Sort by modification time or name to find latest
316
+ dirs.sort(key=lambda x: x.stat().st_mtime)
317
+ latest_checkpoint = dirs[-1]
318
+ logger.info(f"Found checkpoint: {latest_checkpoint}. Resuming training...")
319
+ accelerator.load_state(str(latest_checkpoint))
320
+
321
+ # Try to infer epoch/step from folder name or saved state if custom tracking
322
+ # Accelerate restores DataLoader state, so we mainly need to know where we are for logging
323
+ # Assuming standard naming or just relying on DataLoader restore.
324
+ # Simple approach: Just trust Accelerate/DataLoader to skip.
325
+ # If you need precise epoch/step recovery for logging display:
326
+ # You could save a metadata.json inside the checkpoint folder.
327
+
328
+ logger.info("Checkpoint loaded. DataLoader state restored.")
329
+ else:
330
+ logger.info("No checkpoint found. Starting fresh.")
331
+
332
+ # --- 7. Training Loop ---
333
+ total_steps = 0
334
+
335
+ logger.info("***** Running training *****")
336
+ logger.info(f" Num examples = {len(dataset)}")
337
+ logger.info(f" Num Epochs = {epochs}")
338
+ logger.info(f" Instantaneous batch size per device = {batch_size}")
339
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {batch_size * accelerator.num_processes * grad_accum_steps}")
340
+ logger.info(f" Gradient Accumulation steps = {grad_accum_steps}")
341
+ logger.info(f" Total optimization steps = {max_train_steps}")
342
+
343
+ for epoch in range(starting_epoch, epochs):
344
+ model.train()
345
+ epoch_loss = 0.0
346
+ valid_batches = 0
347
+
348
+ # Tqdm only on main process
349
+ progress_bar = tqdm(
350
+ dataloader,
351
+ desc=f"Epoch {epoch+1}/{epochs}",
352
+ disable=not accelerator.is_local_main_process,
353
+ initial=resume_step # If you calculate resume_step from checkpoint
354
+ )
355
+
356
+ for step, batch in enumerate(progress_bar):
357
+             # Skip steps if resuming. Accelerate's dataloader may handle this automatically
358
+             # if configured, but 'skip_first_batches' is often manual; for simplicity we
359
+             # assume load_state restored the dataloader iterator.
360
+
361
+ if batch is None:
362
+ continue
363
+
364
+ # Safety Patch for missing social data
365
+ if 'textual_event_indices' not in batch:
366
+ B, L = batch['event_type_ids'].shape
367
+ batch['textual_event_indices'] = torch.zeros((B, L), dtype=torch.long, device=accelerator.device)
368
+ if 'textual_event_data' not in batch:
369
+ batch['textual_event_data'] = []
370
+
371
+ grad_stats: Optional[Dict[str, float]] = None
372
+ module_grad_stats: Dict[str, float] = {}
373
+ with accelerator.accumulate(model):
374
+ outputs = model(batch)
375
+
376
+ preds = outputs["quantile_logits"]
377
+ labels = batch["labels"]
378
+ labels_mask = batch["labels_mask"]
379
+
380
+ if labels_mask.sum() == 0:
381
+ loss = torch.tensor(0.0, requires_grad=True, device=accelerator.device)
382
+ else:
383
+ loss = quantile_pinball_loss(preds, labels, labels_mask, quantiles)
384
+
385
+ accelerator.backward(loss)
386
+
387
+ if accelerator.sync_gradients:
388
+ accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)
389
+ grad_stats, module_grad_stats = compute_gradient_stats(model)
390
+ if grad_stats and accelerator.is_main_process:
391
+ logger.info(
392
+ "Gradients - mean norm: %.4f | max norm: %.4f | max abs: %.4f",
393
+ grad_stats["grad_norm_mean"],
394
+ grad_stats["grad_norm_max"],
395
+ grad_stats["grad_abs_max"],
396
+ )
397
+ if module_grad_stats:
398
+ module_entries = " | ".join(
399
+ f"{name}: {norm:.4f}" for name, norm in sorted(module_grad_stats.items())
400
+ )
401
+ logger.info("Per-module grad norms: %s", module_entries)
402
+
403
+ optimizer.step()
404
+ scheduler.step()
405
+ optimizer.zero_grad()
406
+
407
+ # Logging
408
+ if accelerator.sync_gradients:
409
+ total_steps += 1
410
+ current_loss = loss.item()
411
+ epoch_loss += current_loss
412
+ valid_batches += 1
413
+
414
+ if total_steps % log_every == 0:
415
+ lr = scheduler.get_last_lr()[0]
416
+ log_payload = {
417
+ "train/loss": current_loss,
418
+ "train/learning_rate": lr,
419
+ "train/epoch": epoch + (step / len(dataloader))
420
+ }
421
+ if grad_stats:
422
+ log_payload.update({
423
+ "train/grad_norm_mean": grad_stats["grad_norm_mean"],
424
+ "train/grad_norm_max": grad_stats["grad_norm_max"],
425
+ "train/grad_abs_max": grad_stats["grad_abs_max"],
426
+ })
427
+ accelerator.log(log_payload, step=total_steps)
428
+
429
+ if accelerator.is_main_process:
430
+ progress_bar.set_postfix({"loss": f"{current_loss:.4f}", "lr": f"{lr:.2e}"})
431
+ if grad_stats:
432
+ logger.info(
433
+ "Step %d | loss %.4f | grad_norm %.4f",
434
+ total_steps,
435
+ current_loss,
436
+ grad_stats["grad_norm_mean"],
437
+ )
438
+
439
+ # Save Checkpoint periodically
440
+ if total_steps % save_every == 0:
441
+ if accelerator.is_main_process:
442
+ save_path = checkpoint_dir / f"checkpoint-{total_steps}"
443
+ accelerator.save_state(output_dir=str(save_path))
444
+ logger.info(f"Saved checkpoint to {save_path}")
445
+
446
+ # End of Epoch Handling
447
+ if valid_batches > 0:
448
+ avg_loss = epoch_loss / valid_batches
449
+ if accelerator.is_main_process:
450
+ logger.info(f"Epoch {epoch+1} complete. Avg loss: {avg_loss:.6f}")
451
+                 accelerator.log({"train/loss_epoch": avg_loss}, step=total_steps)
452
+
453
+ # Save Checkpoint at end of epoch
454
+ save_path = checkpoint_dir / f"epoch_{epoch+1}"
455
+ accelerator.save_state(output_dir=str(save_path))
456
+ logger.info(f"Saved checkpoint to {save_path}")
457
+ else:
458
+ if accelerator.is_main_process:
459
+ logger.warning(f"Epoch {epoch+1}: No valid batches processed.")
460
+
461
+ accelerator.end_training()
462
+ neo4j_driver.close()
463
+
464
+ if __name__ == "__main__":
465
+ main()
train.sh ADDED
@@ -0,0 +1,23 @@
1
+ accelerate launch train.py \
2
+ --epochs 1 \
3
+ --batch_size 1 \
4
+ --learning_rate 1e-4 \
5
+ --warmup_ratio 0.1 \
6
+ --grad_accum_steps 1 \
7
+ --max_grad_norm 1.0 \
8
+ --seed 42 \
9
+ --log_every 1 \
10
+ --save_every 1000 \
11
+ --tensorboard_dir runs/oracle \
12
+ --checkpoint_dir checkpoints \
13
+ --mixed_precision bf16 \
14
+ --max_seq_len 50 \
15
+ --ohlc_seq_len 300 \
16
+ --horizons_seconds 30 60 120 240 420 \
17
+ --quantiles 0.1 0.5 0.9 \
18
+ --ohlc_stats_path ./data/ohlc_stats.npz \
19
+ --t_cutoff_seconds 60 \
20
+ --num_workers 4 \
21
+ --clickhouse_host localhost \
22
+ --clickhouse_port 9000 \
23
+ --neo4j_uri bolt://localhost:7687
train.yaml ADDED
@@ -0,0 +1,30 @@
1
+ training:
2
+ epochs: 1
3
+ batch_size: 1
4
+ learning_rate: 5.0e-05
5
+ use_amp: true
6
+ log_every: 1
7
+ disable_tqdm: false
8
+ tensorboard_logdir: runs/oracle
9
+ checkpoint_path: checkpoints/oracle_checkpoint.pt
10
+
11
+ data:
12
+ max_samples: null
13
+ horizons_seconds: [30, 60, 120, 240, 420]
14
+ quantiles: [0.1, 0.5, 0.9]
15
+ max_seq_len: 50
16
+ ohlc_seq_len: 300
17
+ ohlc_stats_path: ./data/ohlc_stats.npz
18
+ t_cutoff_seconds: 60
19
+ shuffle: true
20
+ num_workers: 0
21
+ pin_memory: false
22
+
23
+ databases:
24
+ clickhouse:
25
+ host: localhost
26
+ port: 9000
27
+ neo4j:
28
+ uri: bolt://localhost:7687
29
+ user: null
30
+ password: null
utils.sql ADDED
@@ -0,0 +1,69 @@
1
+
2
+
3
+ OPTIMIZE TABLE wallet_profiles FINAL;
4
+ OPTIMIZE TABLE wallet_profile_metrics_latest FINAL;
5
+ OPTIMIZE TABLE wallet_holdings_latest FINAL;
6
+ OPTIMIZE TABLE tokens_latest FINAL;
7
+ OPTIMIZE TABLE token_metrics_latest FINAL;
8
+
9
+
10
+ TRUNCATE TABLE wallet_holdings;
11
+ TRUNCATE TABLE trades;
12
+ TRUNCATE TABLE transfers;
13
+ TRUNCATE TABLE burns;
14
+ TRUNCATE TABLE tokens;
15
+ TRUNCATE TABLE mints;
16
+ TRUNCATE TABLE liquidity;
17
+ TRUNCATE TABLE pool_creations;
18
+ TRUNCATE TABLE token_metrics;
19
+ TRUNCATE TABLE wallet_profile_metrics;
20
+ TRUNCATE TABLE migrations;
21
+ TRUNCATE TABLE fee_collections;
22
+ TRUNCATE TABLE supply_locks;
23
+ TRUNCATE TABLE supply_lock_actions;
24
+
25
+
26
+ TRUNCATE TABLE wallet_profile_metrics_latest;
27
+ TRUNCATE TABLE wallet_holdings_latest;
28
+ TRUNCATE TABLE token_metrics_latest;
29
+ TRUNCATE TABLE tokens_latest;
30
+ TRUNCATE TABLE wallet_profiles;
31
+
32
+
33
+ DROP TABLE IF EXISTS trades;
34
+ DROP TABLE IF EXISTS mints;
35
+ DROP TABLE IF EXISTS migrations;
36
+ DROP TABLE IF EXISTS fee_collections;
37
+ DROP TABLE IF EXISTS liquidity;
38
+ DROP TABLE IF EXISTS pool_creations;
39
+ DROP TABLE IF EXISTS transfers;
40
+ DROP TABLE IF EXISTS burns;
41
+ DROP TABLE IF EXISTS wallet_profiles;
42
+ DROP TABLE IF EXISTS wallet_holdings;
43
+ DROP TABLE IF EXISTS wallet_profile_metrics;
44
+ DROP TABLE IF EXISTS wallet_profile_metrics_latest;
45
+ DROP TABLE IF EXISTS tokens;
46
+ DROP TABLE IF EXISTS token_metrics;
47
+ DROP TABLE IF EXISTS token_metrics_latest;
48
+ DROP TABLE IF EXISTS supply_locks;
49
+ DROP TABLE IF EXISTS supply_lock_actions;
50
+ DROP TABLE IF EXISTS wallet_holdings_latest;
51
+ DROP TABLE IF EXISTS tokens_latest;
52
+
53
+
54
+ -- Backfilling Logic
55
+
56
+ CREATE TABLE IF NOT EXISTS tokens_backfill
57
+ (
58
+ token_address String,
59
+ name String,
60
+ symbol String,
61
+ token_uri String,
62
+ is_mutable UInt8,
63
+ update_authority Nullable(String),
64
+ mint_authority Nullable(String),
65
+ freeze_authority Nullable(String),
66
+ protocol UInt8
67
+ )
68
+ ENGINE = MergeTree
69
+ ORDER BY token_address;
validate.py ADDED
@@ -0,0 +1,210 @@
+ import argparse
+ import yaml
+ import torch
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ from clickhouse_driver import Client as ClickHouseClient
+ from neo4j import GraphDatabase
+
+ from data.data_fetcher import DataFetcher
+ from data.data_loader import OracleDataset
+ from data.data_collator import MemecoinCollator
+ from models.multi_modal_processor import MultiModalEncoder
+ from models.helper_encoders import ContextualTimeEncoder
+ from models.token_encoder import TokenEncoder
+ from models.wallet_encoder import WalletEncoder
+ from models.graph_updater import GraphUpdater
+ from models.ohlc_embedder import OHLCEmbedder
+ from models.model import Oracle
+ import models.vocabulary as vocab
+
+
+ def quantile_pinball_loss(preds: torch.Tensor,
+                           targets: torch.Tensor,
+                           mask: torch.Tensor,
+                           quantiles: List[float]) -> torch.Tensor:
+     if mask.sum() == 0:
+         return torch.tensor(0.0, device=preds.device, dtype=preds.dtype)
+     num_q = len(quantiles)
+     losses = []
+     for idx, q in enumerate(quantiles):
+         pred_slice = preds[:, idx::num_q]
+         target_slice = targets[:, idx::num_q]
+         mask_slice = mask[:, idx::num_q]
+         diff = target_slice - pred_slice
+         pinball = torch.maximum((q - 1.0) * diff, q * diff)
+         losses.append((pinball * mask_slice).sum())
+     return sum(losses) / mask.sum().clamp_min(1.0)
+
+
+ def load_config(path: str) -> Dict[str, Any]:
+     cfg_path = Path(path)
+     if not cfg_path.exists():
+         raise FileNotFoundError(f"Config file not found: {cfg_path}")
+     with cfg_path.open("r") as handle:
+         return yaml.safe_load(handle) or {}
+
+
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="Validate Oracle checkpoint on a single token.")
+     parser.add_argument("--config", type=str, default="train.yaml", help="Path to training YAML config.")
+     parser.add_argument("--checkpoint", type=str, default=None, help="Checkpoint to load. Defaults to config training.checkpoint_path.")
+     parser.add_argument("--sample-idx", type=int, default=0, help="Dataset index to validate.")
+     parser.add_argument("--token-address", type=str, default=None, help="Optional mint address to pick instead of index.")
+     parser.add_argument("--t-cutoff-seconds", type=int, default=None, help="Override cutoff horizon (seconds after mint).")
+     return parser.parse_args()
+
+
+ def resolve_sample_index(dataset: OracleDataset,
+                          sample_idx: int,
+                          token_address: Optional[str]) -> int:
+     if token_address:
+         for idx, mint in enumerate(getattr(dataset, "sampled_mints", [])):
+             if mint.get("mint_address") == token_address:
+                 return idx
+         raise ValueError(f"Token {token_address} not found in loaded dataset.")
+     if sample_idx < 0 or sample_idx >= len(dataset):
+         raise ValueError(f"Sample index {sample_idx} out of range (len={len(dataset)}).")
+     return sample_idx
+
+
+ def move_to_device(batch: Dict[str, Any], device: torch.device) -> Dict[str, Any]:
+     for key, value in list(batch.items()):
+         if torch.is_tensor(value):
+             batch[key] = value.to(device)
+     return batch
+
+
+ def main() -> None:
+     args = parse_args()
+     config = load_config(args.config)
+
+     training_cfg = config.get("training", {})
+     data_cfg = config.get("data", {})
+     db_cfg = config.get("databases", {})
+
+     checkpoint_path = Path(args.checkpoint or training_cfg.get("checkpoint_path", "checkpoints/oracle_checkpoint.pt")).expanduser()
+     if not checkpoint_path.exists():
+         raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     dtype = torch.bfloat16 if device.type == "cuda" and torch.cuda.is_bf16_supported() else torch.float16
+     if device.type == "cpu":
+         dtype = torch.float32
+
+     quantiles = data_cfg.get("quantiles", [0.1, 0.5, 0.9])
+     horizons = data_cfg.get("horizons_seconds", [30, 60, 120, 240, 420])
+     max_samples = data_cfg.get("max_samples", None)
+     max_seq_len = data_cfg.get("max_seq_len", 50)
+     ohlc_seq_len = data_cfg.get("ohlc_seq_len", 60)
+     default_t_cutoff = int(data_cfg.get("t_cutoff_seconds", 60))
+     t_cutoff_seconds = int(args.t_cutoff_seconds) if args.t_cutoff_seconds is not None else default_t_cutoff
+     ohlc_stats_path = data_cfg.get("ohlc_stats_path", "./data/ohlc_stats.npz")
+
+     multi_modal_encoder = MultiModalEncoder(dtype=dtype)
+     time_encoder = ContextualTimeEncoder(dtype=dtype)
+     token_encoder = TokenEncoder(multi_dim=multi_modal_encoder.embedding_dim, dtype=dtype)
+     wallet_encoder = WalletEncoder(encoder=multi_modal_encoder, dtype=dtype)
+     graph_updater = GraphUpdater(time_encoder=time_encoder, dtype=dtype)
+     ohlc_embedder = OHLCEmbedder(
+         num_intervals=vocab.NUM_OHLC_INTERVALS,
+         sequence_length=ohlc_seq_len,
+         dtype=dtype
+     )
+
+     collator = MemecoinCollator(
+         event_type_to_id=vocab.EVENT_TO_ID,
+         device=device,
+         multi_modal_encoder=multi_modal_encoder,
+         dtype=dtype,
+         ohlc_seq_len=ohlc_seq_len,
+         max_seq_len=max_seq_len
+     )
+
+     clickhouse_cfg = db_cfg.get("clickhouse", {})
+     clickhouse_client = ClickHouseClient(
+         host=clickhouse_cfg.get("host", "localhost"),
+         port=int(clickhouse_cfg.get("port", 9000))
+     )
+
+     neo4j_cfg = db_cfg.get("neo4j", {})
+     neo4j_auth = None
+     if neo4j_cfg.get("user") is not None:
+         neo4j_auth = (neo4j_cfg.get("user"), neo4j_cfg.get("password") or "")
+     neo4j_driver = GraphDatabase.driver(neo4j_cfg.get("uri", "bolt://localhost:7687"), auth=neo4j_auth)
+
+     data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
+     dataset = OracleDataset(
+         data_fetcher=data_fetcher,
+         horizons_seconds=horizons,
+         quantiles=quantiles,
+         max_samples=max_samples,
+         ohlc_stats_path=ohlc_stats_path,
+         token_allowlist=[args.token_address] if args.token_address else None,
+         t_cutoff_seconds=t_cutoff_seconds
+     )
+     if len(dataset) == 0:
+         raise RuntimeError("Dataset is empty; cannot validate.")
+
+     sample_idx = resolve_sample_index(dataset, args.sample_idx, args.token_address)
+     sample = dataset[sample_idx]
+     if sample is None:
+         raise RuntimeError(f"Dataset returned None for sample index {sample_idx}.")
+
+     token_address = getattr(dataset, "sampled_mints", [{}])[sample_idx].get("mint_address", "Unknown")
+     print(f"Validating token {token_address} (dataset idx {sample_idx}) with T_cutoff {t_cutoff_seconds} second(s) after mint")
+
+     collated = collator([sample])
+     collated = move_to_device(collated, device)
+
+     model = Oracle(
+         token_encoder=token_encoder,
+         wallet_encoder=wallet_encoder,
+         graph_updater=graph_updater,
+         ohlc_embedder=ohlc_embedder,
+         time_encoder=time_encoder,
+         num_event_types=vocab.NUM_EVENT_TYPES,
+         multi_modal_dim=multi_modal_encoder.embedding_dim,
+         event_pad_id=vocab.EVENT_TO_ID["__PAD__"],
+         event_type_to_id=vocab.EVENT_TO_ID,
+         quantiles=quantiles,
+         horizons_seconds=horizons,
+         dtype=dtype
+     ).to(device)
+     checkpoint = torch.load(checkpoint_path, map_location=device)
+     model.load_state_dict(checkpoint["model_state_dict"])
+     model.eval()
+
+     with torch.no_grad():
+         outputs = model(collated)
+         preds = outputs["quantile_logits"]
+         labels = collated["labels"]
+         labels_mask = collated["labels_mask"]
+
+     loss = quantile_pinball_loss(preds, labels, labels_mask, quantiles).item()
+     print(f"Pinball loss (masked): {loss:.6f}")
+
+     B = preds.shape[0]
+     grid = preds.view(B, len(horizons), len(quantiles))
+     label_grid = labels.view(B, len(horizons), len(quantiles))
+     mask_grid = labels_mask.view(B, len(horizons), len(quantiles))
+
+     for b in range(B):
+         print(f"\nSample {b} predictions:")
+         for h_idx, horizon in enumerate(horizons):
+             pred_row = grid[b, h_idx]
+             label_row = label_grid[b, h_idx]
+             mask_row = mask_grid[b, h_idx]
+             row_str = ", ".join(
+                 f"q={quantiles[q_idx]:.2f}: pred={pred_row[q_idx].item():.6f}, "
+                 f"label={label_row[q_idx].item():.6f}, mask={int(mask_row[q_idx].item())}"
+                 for q_idx in range(len(quantiles))
+             )
+             print(f"    Horizon {horizon:>4}s -> {row_str}")
+
+     neo4j_driver.close()
+
+
+ if __name__ == "__main__":
+     main()
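The `quantile_pinball_loss` in validate.py assumes a flat layout in which quantiles cycle fastest: `[h0q0, h0q1, ..., h1q0, ...]`, which is why both the strided slice `preds[:, idx::num_q]` and the later `preds.view(B, len(horizons), len(quantiles))` decode the same grid. The dependency-free sketch below (hypothetical `pinball` helper, plain Python lists instead of tensors, not part of the repo) reproduces that masked computation so the layout and reduction are easy to inspect.

```python
def pinball(preds, targets, mask, quantiles):
    """Masked pinball (quantile) loss over flat rows laid out
    horizon-major with quantiles contiguous, as in validate.py."""
    num_q = len(quantiles)
    total, count = 0.0, 0
    for p_row, t_row, m_row in zip(preds, targets, mask):
        for j, (p, t, m) in enumerate(zip(p_row, t_row, m_row)):
            q = quantiles[j % num_q]  # quantile index cycles fastest
            diff = t - p
            # pinball loss: q * diff if under-predicting, (1 - q) * |diff| otherwise
            total += max((q - 1.0) * diff, q * diff) * m
            count += m
    return total / max(count, 1)  # mirrors mask.sum().clamp_min(1.0)

# One sample, one horizon, quantiles [0.1, 0.5, 0.9], all under-predicted by 1:
# per-entry losses are 0.1, 0.5, 0.9 -> mean 0.5
# pinball([[0.0, 0.0, 0.0]], [[1.0, 1.0, 1.0]], [[1, 1, 1]], [0.1, 0.5, 0.9]) -> 0.5
```

Note the asymmetry this buys: for the 0.9 quantile, under-prediction costs 0.9 per unit of error while over-prediction costs only 0.1, pushing the head toward an upper bound; a fully-masked row contributes zero, matching the early-return guard in the torch version.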
validate.sh ADDED
@@ -0,0 +1,6 @@
+ python validate.py \
+     --config train.yaml \
+     --checkpoint checkpoints/oracle_checkpoint.pt \
+     --t-cutoff-seconds 240 \
+     --token-address 'czaE9hrSWJ6g21bxS6qh9GbbczoRa5F5Lx2eo1apump'
+
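For reference, the `config.get` calls in validate.py imply the following train.yaml shape. The values shown are the script's own fallback defaults, so this is a sketch of the minimal expected structure, not the repository's actual config:

```yaml
training:
  checkpoint_path: checkpoints/oracle_checkpoint.pt

data:
  quantiles: [0.1, 0.5, 0.9]
  horizons_seconds: [30, 60, 120, 240, 420]
  max_samples: null
  max_seq_len: 50
  ohlc_seq_len: 60
  t_cutoff_seconds: 60
  ohlc_stats_path: ./data/ohlc_stats.npz

databases:
  clickhouse:
    host: localhost
    port: 9000
  neo4j:
    uri: bolt://localhost:7687
    user: null        # auth is skipped entirely when user is null
    password: null
```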