oracle / data /data_collator.py

Upload folder using huggingface_hub

d195287 verified 25 days ago

46.5 kB

	# memecoin_collator.py (CORRECTED ORDER OF OPERATIONS)

	import torch
	import torch.nn.functional as F
	from torch.nn.utils.rnn import pad_sequence
	from typing import List, Dict, Any, Tuple, Optional, Union
	from collections import defaultdict
	from PIL import Image
	# --- GLOBAL SINGLETON FOR WORKER PROCESSES REMOVED ---

	import models.vocabulary as vocab
	from data.data_loader import EmbeddingPooler
	from data.quant_ohlc_feature_schema import FEATURE_VERSION, FEATURE_VERSION_ID, NUM_QUANT_OHLC_FEATURES, TOKENS_PER_SEGMENT

	# Mint address that QUOTE_MINTS below labels as SOL.
	NATIVE_MINT = "So11111111111111111111111111111111111111112"
	# Mint addresses treated as quote currencies. MemecoinCollator._collate_graph_links
	# skips transfers of these mints when deriving wallet->token "TransferLinkToken" edges.
	QUOTE_MINTS = {
	NATIVE_MINT, # SOL
	"EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", # USDC
	"Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB", # USDT
	"USD1ttGY1N17NEEHLmELoaybftRBUSErhqYiQzvEmuB", # USD1
	}

	class MemecoinCollator:
	"""
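	Callable class for PyTorch DataLoader's collate_fn.

	Calling an instance on a list of raw data items merges the items' embedding
	poolers into one batch-wide embedding pool, remaps the per-item `*_emb_idx`
	fields to that pool, and collates token/wallet encoder inputs, graph links,
	chart (OHLC) segments and per-event feature tensors into a single batch dict.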
	"""
	def __init__(self,
	        event_type_to_id: Dict[str, int],
	        device: torch.device,
	        dtype: torch.dtype,
	        max_seq_len: Optional[int] = None,
	        model_id: str = "google/siglip-so400m-patch16-256-i18n",
	        *,
	        ohlc_seq_len: int = 300,
	        ):
	    """Store the configuration used when collating a batch.

	    Args:
	        event_type_to_id: Maps event type names to integer ids. The id stored
	            under '__PAD__' (0 when the key is absent) becomes `pad_token_id`.
	        device: Device on which the collated tensors are created.
	        dtype: Floating-point dtype used for the collated feature tensors.
	        max_seq_len: Optional cap on the number of events kept per sequence;
	            None disables truncation.
	        model_id: Identifier stored on the instance as `self.model_id`.
	        ohlc_seq_len: Number of open/close values kept per chart segment.
	            Keyword-only; defaults to 300, the value that used to be
	            hard-coded here.

	    Raises:
	        ValueError: If `ohlc_seq_len` is not positive.
	    """
	    if ohlc_seq_len <= 0:
	        raise ValueError(f"ohlc_seq_len must be positive, got {ohlc_seq_len}.")

	    self.event_type_to_id = event_type_to_id
	    self.pad_token_id = event_type_to_id.get('__PAD__', 0)
	    self.model_id = model_id
	    # Index 0 is reserved as "no entity"; batch-wide entity indices start at 1.
	    self.entity_pad_idx = 0

	    self.device = device
	    self.dtype = dtype
	    self.ohlc_seq_len = ohlc_seq_len
	    # Quantitative chart feature layout comes from the shared feature schema module.
	    self.quant_ohlc_tokens = TOKENS_PER_SEGMENT
	    self.quant_ohlc_num_features = NUM_QUANT_OHLC_FEATURES
	    self.max_seq_len = max_seq_len

	def _collate_features_for_encoder(self, entities: List[Dict], feature_keys: List[str], device: torch.device, entity_type: str) -> Dict[str, Any]:
	""" (Unchanged) """
	collated = defaultdict(list)
	if not entities:
	# --- FIXED: Return a default empty structure for BOTH tokens and wallets ---
	if entity_type == "token":
	return {
	'name_embed_indices': torch.tensor([], device=device, dtype=torch.long),
	'symbol_embed_indices': torch.tensor([], device=device, dtype=torch.long),
	'image_embed_indices': torch.tensor([], device=device, dtype=torch.long),
	'protocol_ids': torch.tensor([], device=device, dtype=torch.long),
	'is_vanity_flags': torch.tensor([], device=device, dtype=torch.bool),
	'_addresses_for_lookup': []
	}
	elif entity_type == "wallet":
	return {
	'username_embed_indices': torch.tensor([], device=device, dtype=torch.long),
	'profile_rows': [], 'social_rows': [], 'holdings_batch': []
	}
	return {} # Should not happen

	# NEW: We now gather indices to pre-computed embeddings
	if entity_type == "token":
	# This indicates a Token entity
	# Helper key for WalletEncoder to find token vibes
	collated['_addresses_for_lookup'] = [e.get('address', '') for e in entities]
	collated['name_embed_indices'] = torch.tensor([e.get('name_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
	collated['symbol_embed_indices'] = torch.tensor([e.get('symbol_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
	collated['image_embed_indices'] = torch.tensor([e.get('image_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
	collated['protocol_ids'] = torch.tensor([e.get('protocol', 0) for e in entities], device=device, dtype=torch.long)
	collated['is_vanity_flags'] = torch.tensor([e.get('is_vanity', False) for e in entities], device=device, dtype=torch.bool)
	elif entity_type == "wallet":
	# NEW: Gather username indices for WalletEncoder
	collated['username_embed_indices'] = torch.tensor([e.get('socials', {}).get('username_emb_idx', 0) for e in entities], device=device, dtype=torch.long)
	collated['profile_rows'] = [e.get('profile', {}) for e in entities]
	collated['social_rows'] = [e.get('socials', {}) for e in entities]
	collated['holdings_batch'] = [e.get('holdings', []) for e in entities]
	return dict(collated)

	def _collate_ohlc_inputs(self, chart_events: List[Dict]) -> Dict[str, torch.Tensor]:
	""" (Unchanged from previous correct version) """
	if not chart_events:
	return {
	'price_tensor': torch.empty(0, 2, self.ohlc_seq_len, device=self.device, dtype=self.dtype),
	'interval_ids': torch.empty(0, device=self.device, dtype=torch.long),
	'quant_feature_tensors': torch.empty(0, self.quant_ohlc_tokens, self.quant_ohlc_num_features, device=self.device, dtype=self.dtype),
	'quant_feature_mask': torch.empty(0, self.quant_ohlc_tokens, device=self.device, dtype=self.dtype),
	'quant_feature_version_ids': torch.empty(0, device=self.device, dtype=torch.long),
	}
	ohlc_tensors = []
	interval_ids_list = []
	quant_feature_tensors = []
	quant_feature_masks = []
	quant_feature_version_ids = []
	seq_len = self.ohlc_seq_len
	unknown_id = vocab.INTERVAL_TO_ID.get("Unknown", 0)
	for segment_data in chart_events:
	opens = segment_data.get('opens', [])
	closes = segment_data.get('closes', [])
	interval_str = segment_data.get('i', "Unknown")
	pad_open = opens[-1] if opens else 0
	pad_close = closes[-1] if closes else 0
	o = torch.tensor(opens[:seq_len] + [pad_open]*(seq_len-len(opens)), dtype=self.dtype)
	c = torch.tensor(closes[:seq_len] + [pad_close]*(seq_len-len(closes)), dtype=self.dtype)
	o = torch.nan_to_num(o, nan=0.0, posinf=0.0, neginf=0.0)
	c = torch.nan_to_num(c, nan=0.0, posinf=0.0, neginf=0.0)
	ohlc_tensors.append(torch.stack([o, c]))
	interval_id = vocab.INTERVAL_TO_ID.get(interval_str, unknown_id)
	interval_ids_list.append(interval_id)
	quant_payload = segment_data.get('quant_ohlc_features')
	if quant_payload is None:
	raise RuntimeError("Chart_Segment missing quant_ohlc_features. Rebuild cache with quantitative chart features.")
	if not isinstance(quant_payload, list):
	raise RuntimeError("Chart_Segment quant_ohlc_features must be a list.")
	feature_rows = []
	feature_mask = []
	for token_idx in range(self.quant_ohlc_tokens):
	if token_idx < len(quant_payload):
	payload = quant_payload[token_idx]
	vec = payload.get('feature_vector')
	if not isinstance(vec, list) or len(vec) != self.quant_ohlc_num_features:
	raise RuntimeError(
	f"Chart_Segment quant feature vector must have length {self.quant_ohlc_num_features}."
	)
	feature_rows.append(vec)
	feature_mask.append(1.0)
	else:
	feature_rows.append([0.0] * self.quant_ohlc_num_features)
	feature_mask.append(0.0)
	quant_feature_tensors.append(torch.tensor(feature_rows, device=self.device, dtype=self.dtype))
	quant_feature_masks.append(torch.tensor(feature_mask, device=self.device, dtype=self.dtype))
	version = segment_data.get('quant_feature_version', FEATURE_VERSION)
	quant_feature_version_ids.append(FEATURE_VERSION_ID if version == FEATURE_VERSION else 0)
	return {
	'price_tensor': torch.stack(ohlc_tensors).to(self.device),
	'interval_ids': torch.tensor(interval_ids_list, device=self.device, dtype=torch.long),
	'quant_feature_tensors': torch.stack(quant_feature_tensors).to(self.device),
	'quant_feature_mask': torch.stack(quant_feature_masks).to(self.device),
	'quant_feature_version_ids': torch.tensor(quant_feature_version_ids, device=self.device, dtype=torch.long),
	}

	def _collate_graph_links(self,
	batch_items: List[Dict],
	wallet_addr_to_batch_idx: Dict[str, int],
	token_addr_to_batch_idx: Dict[str, int]) -> Dict[str, Any]:
	"""
	Merge every item's 'graph_links' into batch-wide edge tensors, keyed by link name.

	Args:
	batch_items: Raw batch items; each may carry 'wallets', 'tokens' and
	'graph_links' (link name -> {'edges': [(src_addr, dst_addr), ...], 'links': [props, ...]}).
	wallet_addr_to_batch_idx: Wallet address -> batch-wide index.
	token_addr_to_batch_idx: Token address -> batch-wide index.

	Returns:
	Dict of link name -> {'links': list of link props, 'edge_index': long tensor
	of shape [2, num_edges]}. Link names that end up with no edges are omitted.
	Entries named "TransferLink" can additionally produce a derived
	"TransferLinkToken" entry holding wallet -> token edges.
	"""
	# Per link name: one edge tensor per contributing item, plus the flat list of link props.
	aggregated_links = defaultdict(lambda: {'edge_index_list': [], 'links_list': []})
	for item in batch_items:
	item_wallets = item.get('wallets', {})
	item_tokens = item.get('tokens', {})
	# Only addresses listed in this item's own 'wallets'/'tokens' are resolvable;
	# addresses missing from the batch-wide maps resolve to the pad index.
	item_wallet_addr_to_global_idx = {addr: wallet_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_wallets.keys()}
	item_token_addr_to_global_idx = {addr: token_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_tokens.keys()}
	for link_name, data in item.get('graph_links', {}).items():
	# Link props are appended together with their edges below, so 'links' stays
	# aligned one-to-one with the columns of 'edge_index'.
	triplet = vocab.LINK_NAME_TO_TRIPLET.get(link_name)
	# Link names without a triplet in the vocabulary are skipped.
	if not triplet: continue
	src_type, _, dst_type = triplet
	edges = data.get('edges')
	link_props_list = data.get('links', [])
	# Nothing to do when either the edges or their props are missing/empty.
	if not edges or not link_props_list: continue

	# 'wallet' endpoints use the wallet map; any other endpoint type uses the token map.
	src_map = item_wallet_addr_to_global_idx if src_type == 'wallet' else item_token_addr_to_global_idx
	dst_map = item_wallet_addr_to_global_idx if dst_type == 'wallet' else item_token_addr_to_global_idx

	remapped_edge_list = []
	valid_link_props = []

	# zip() pairs each edge with its props and stops at the shorter of the two lists.
	for (src_addr, dst_addr), props in zip(edges, link_props_list):
	src_idx_global = src_map.get(src_addr, self.entity_pad_idx)
	dst_idx_global = dst_map.get(dst_addr, self.entity_pad_idx)

	# Keep an edge (and its props) only when both endpoints resolved to a non-pad index.
	if src_idx_global != self.entity_pad_idx and dst_idx_global != self.entity_pad_idx:
	remapped_edge_list.append([src_idx_global, dst_idx_global])
	valid_link_props.append(props)

	if remapped_edge_list:
	# .t() turns the [num_edges, 2] list into a [2, num_edges] tensor.
	remapped_edge_tensor = torch.tensor(remapped_edge_list, device=self.device, dtype=torch.long).t()
	aggregated_links[link_name]['edge_index_list'].append(remapped_edge_tensor)
	aggregated_links[link_name]['links_list'].extend(valid_link_props)
	# NOTE(review): indentation is not visible in this copy of the file, so it is unclear
	# whether the TransferLink branch below is nested under the `if remapped_edge_list`
	# check above or runs for every TransferLink entry - confirm against the original.
	# Derive extra wallet -> token edges from transfers, stored under "TransferLinkToken".
	if link_name == "TransferLink":
	link_props = data.get('links', [])
	derived_edges = []
	derived_props = []
	for (src_addr, dst_addr), props in zip(edges, link_props):
	mint_addr = props.get('mint')
	# Skip transfers with no mint or whose mint is one of QUOTE_MINTS.
	if not mint_addr or mint_addr in QUOTE_MINTS:
	continue
	token_idx_global = item_token_addr_to_global_idx.get(mint_addr, self.entity_pad_idx)
	# Skip mints that do not resolve to a token index for this item.
	if token_idx_global == self.entity_pad_idx:
	continue
	# Both the source and the destination address get an edge to the token,
	# provided the address resolves to a wallet index; the same props are reused.
	for wallet_addr in (src_addr, dst_addr):
	wallet_idx_global = item_wallet_addr_to_global_idx.get(wallet_addr, self.entity_pad_idx)
	if wallet_idx_global == self.entity_pad_idx:
	continue
	derived_edges.append([wallet_idx_global, token_idx_global])
	derived_props.append(props)
	if derived_edges:
	derived_tensor = torch.tensor(derived_edges, device=self.device, dtype=torch.long).t()
	aggregated_links["TransferLinkToken"]['edge_index_list'].append(derived_tensor)
	aggregated_links["TransferLinkToken"]['links_list'].extend(derived_props)
	# Concatenate the per-item edge tensors along the edge dimension (dim=1).
	final_links_dict = {}
	for link_name, data in aggregated_links.items():
	if data['edge_index_list']:
	final_links_dict[link_name] = {
	'links': data['links_list'],
	'edge_index': torch.cat(data['edge_index_list'], dim=1)
	}
	return final_links_dict

	def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
	"""
	Processes a batch of raw data items into tensors for the model.
	"""
	# --- NEW ARCHITECTURE ---
	# 1. Aggregate all unique embeddable items from the entire batch.
	# 2. Create a single embedding pool tensor for the whole batch.
	# 3. Create a mapping from original (per-item) indices to the new batch-wide indices.
	# 4. Remap all `_emb_idx` fields in the batch data using this new mapping.

	batch_size = len(batch)
	if batch_size == 0:
	return {}

	# --- 1. Aggregate all unique items and create index mappings ---
	batch_wide_pooler = EmbeddingPooler()
	# Map to translate from an item's original pooler to the new batch-wide indices
	# Format: { batch_item_index: { original_idx: new_batch_idx } }
	idx_remap = defaultdict(dict)

	for i, item in enumerate(batch):
	pooler = item.get('embedding_pooler')
	if not pooler: continue

	for pool_item_data in pooler.get_all_items():
	original_idx = pool_item_data['idx']
	raw_item = pool_item_data['item']
	# get_idx will either return an existing index or create a new one
	# --- FIX: Convert 1-based pooler index to 0-based tensor index ---
	new_batch_idx_1_based = batch_wide_pooler.get_idx(raw_item)
	new_batch_idx_0_based = new_batch_idx_1_based - 1
	idx_remap[i][original_idx] = new_batch_idx_0_based

	# --- 2. Create the single, batch-wide embedding pool tensor ---
	all_items_sorted = batch_wide_pooler.get_all_items()

	if not all_items_sorted:
	# Handle edge case of absolutely no embeddings in batch
	# Create a dummy empty tensor
	batch_embedding_pool = torch.empty(0, 768, device=self.device, dtype=self.dtype) # Default SigLIP dim is 1152 actually, but standard is 768. Better to infer or default.
	# Actually, if empty, it doesn't matter much as long as it's not accessed.
	else:
	first_item = all_items_sorted[0]['item']
	if not isinstance(first_item, torch.Tensor):
	raise RuntimeError(f"Collator expects pre-computed embeddings (torch.Tensor), found {type(first_item)}. Please rebuild cache.")

	# Stack all embeddings
	# They should already be CPU tensors from the loader
	# Move to device and cast to dtype
	batch_embedding_pool = torch.stack([d['item'] for d in all_items_sorted]).to(device=self.device, dtype=self.dtype)
	batch_embedding_pool = torch.nan_to_num(batch_embedding_pool, nan=0.0, posinf=0.0, neginf=0.0)

	# --- 3. Remap all indices in the batch data ---
	for i, item in enumerate(batch):
	remap_dict = idx_remap.get(i, {})
	if not remap_dict: continue

	# Remap tokens
	for token_data in item.get('tokens', {}).values():
	for key in ['name_emb_idx', 'symbol_emb_idx', 'image_emb_idx']:
	if token_data.get(key, 0) > 0: # Check if it has a valid 1-based index
	token_data[key] = remap_dict.get(token_data[key], -1) # Remap to 0-based, default to -1 if not found
	# Remap wallets
	for wallet_data in item.get('wallets', {}).values():
	socials = wallet_data.get('socials', {})
	if socials.get('username_emb_idx', 0) > 0:
	socials['username_emb_idx'] = remap_dict.get(socials['username_emb_idx'], -1)
	# Remap events
	for event in item.get('event_sequence', []):
	for key in event:
	if key.endswith('_emb_idx') and event.get(key, 0) > 0:
	event[key] = remap_dict.get(event[key], 0)

	# --- 4. Standard Collation (Now that indices are correct) ---
	unique_wallets_data = {}
	unique_tokens_data = {}
	all_event_sequences = []
	max_len = 0

	for item in batch:
	seq = item.get('event_sequence', [])
	if self.max_seq_len is not None and len(seq) > self.max_seq_len:
	seq = seq[:self.max_seq_len]
	all_event_sequences.append(seq)
	max_len = max(max_len, len(seq))
	unique_wallets_data.update(item.get('wallets', {}))
	unique_tokens_data.update(item.get('tokens', {}))

	# Create mappings needed for indexing (use dict keys as source of truth)
	wallet_items = list(unique_wallets_data.items())
	token_items = list(unique_tokens_data.items())

	wallet_list_data = []
	for addr, feat in wallet_items:
	profile = feat.get('profile', {})
	if not profile.get('wallet_address'):
	profile['wallet_address'] = addr
	wallet_list_data.append(feat)

	token_list_data = []
	for addr, feat in token_items:
	if not feat.get('address'):
	feat['address'] = addr
	token_list_data.append(feat)

	wallet_addr_to_batch_idx = {addr: i + 1 for i, (addr, _) in enumerate(wallet_items)}
	token_addr_to_batch_idx = {addr: i + 1 for i, (addr, _) in enumerate(token_items)}

	# Collate Static Raw Features (Tokens, Wallets, Graph)
	token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
	# Collate Static Raw Features (Tokens, Wallets, Graph)
	token_encoder_inputs = self._collate_features_for_encoder(token_list_data, ['name'], self.device, "token")
	wallet_encoder_inputs = self._collate_features_for_encoder(wallet_list_data, ['profile'], self.device, "wallet")
	graph_updater_links = self._collate_graph_links(batch, wallet_addr_to_batch_idx, token_addr_to_batch_idx)

	# --- 5. Prepare Sequence Tensors & Collect Dynamic Data (OHLC) ---
	B = batch_size
	L = max_len
	PAD_IDX_SEQ = self.pad_token_id
	PAD_IDX_ENT = self.entity_pad_idx

	# Initialize sequence tensors
	event_type_ids = torch.full((B, L), PAD_IDX_SEQ, dtype=torch.long, device=self.device)
	# Use float64 to preserve second-level precision for large Unix timestamps.
	timestamps_float = torch.zeros((B, L), dtype=torch.float64, device=self.device)
	# Store relative_ts in float32 for stability; model will scale/log/normalize
	relative_ts = torch.zeros((B, L, 1), dtype=torch.float32, device=self.device)
	attention_mask = torch.zeros((B, L), dtype=torch.long, device=self.device)
	wallet_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	ohlc_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	quote_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device) # NEW

	# --- NEW: Tensors for Transfer/LargeTransfer ---
	dest_wallet_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	# --- NEW: Separate tensor for social media original authors ---
	original_author_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	# 4 numerical features for transfers
	transfer_numerical_features = torch.zeros((B, L, 4), dtype=self.dtype, device=self.device)

	# --- NEW: Tensors for Trade ---
	# --- FIXED: Size reduced from 10 to 8 ---
	trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
	deployer_trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
	smart_wallet_trade_numerical_features = torch.zeros((B, L, 8), dtype=self.dtype, device=self.device)
	# --- NEW: Dedicated tensor for categorical dex_platform_id ---
	trade_dex_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	# --- NEW: Dedicated tensor for categorical trade_direction ---
	trade_direction_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	# --- NEW: Dedicated tensor for categorical mev_protection ---
	trade_mev_protection_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	# --- NEW: Dedicated tensor for categorical is_bundle ---
	trade_is_bundle_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for PoolCreated ---
	# --- UPDATED: Capture raw base/quote deposit amounts only ---
	pool_created_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device)
	# --- NEW: Dedicated tensor for categorical protocol_id ---
	pool_created_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for LiquidityChange ---
	# --- UPDATED: Keep only the raw quote amount deposit/withdraw ---
	liquidity_change_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device)
	# --- NEW: Dedicated tensor for categorical change_type_id ---
	liquidity_change_type_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for FeeCollected ---
	fee_collected_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # sol_amount only
	# --- NEW: Tensors for TokenBurn ---
	token_burn_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount_pct, amount_tokens

	# --- NEW: Tensors for SupplyLock ---
	supply_lock_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount_pct, lock_duration

	# --- NEW: Tensors for OnChain_Snapshot ---
	onchain_snapshot_numerical_features = torch.zeros((B, L, 14), dtype=self.dtype, device=self.device)

	# --- NEW: Tensors for TrendingToken ---
	trending_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	# --- FIXED: Size reduced from 3 to 1 after removing IDs ---
	trending_token_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # rank
	trending_token_source_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	trending_token_timeframe_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for BoostedToken ---
	boosted_token_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	boosted_token_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # total_boost_amount, rank

	# --- NEW: Tensors for DexBoost_Paid ---
	dexboost_paid_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # amount, total_amount_on_token

	# --- NEW: Tensors for DexProfile_Updated ---
	dexprofile_updated_flags = torch.zeros((B, L, 4), dtype=torch.float32, device=self.device) # Using float for easier projection

	# --- NEW: Tensors for Tracker Events ---
	alpha_group_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	channel_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	exchange_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for GlobalTrending Events ---
	global_trending_numerical_features = torch.zeros((B, L, 1), dtype=self.dtype, device=self.device) # rank

	# --- NEW: Tensors for ChainSnapshot ---
	chainsnapshot_numerical_features = torch.zeros((B, L, 2), dtype=self.dtype, device=self.device) # native_token_price_usd, gas_fee

	# --- NEW: Tensors for Lighthouse_Snapshot ---
	# --- FIXED: Size reduced from 7 to 5 after removing IDs ---
	lighthousesnapshot_numerical_features = torch.zeros((B, L, 5), dtype=self.dtype, device=self.device)
	lighthousesnapshot_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)
	lighthousesnapshot_timeframe_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for Migrated event ---
	migrated_protocol_ids = torch.full((B, L), 0, dtype=torch.long, device=self.device)

	# --- NEW: Tensors for HolderSnapshot ---
	# This will store the raw holder data for the Oracle to process
	holder_snapshot_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	holder_snapshot_raw_data_list = [] # List of lists of dicts

	# --- RENAMED: Generic tensors for any event with text/image features ---
	textual_event_data_list = [] # List of dicts with text/media indices
	textual_event_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	# --- NEW: Pointers for pre-encoded images ---
	image_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)
	original_post_image_indices = torch.full((B, L), PAD_IDX_ENT, dtype=torch.long, device=self.device)



	# --- CORRECTED: Initialize chart event collection here ---
	batch_chart_events = []
	chart_event_counter = 0

	# Loop through sequences to populate tensors and collect chart events
	for i, seq in enumerate(all_event_sequences):

	seq_len = len(seq)
	if seq_len == 0: continue
	attention_mask[i, :seq_len] = 1

	for j, event in enumerate(seq):
	# Populate basic sequence info
	event_type = event.get('event_type', '__PAD__')
	type_id = self.event_type_to_id.get(event_type, PAD_IDX_SEQ)
	event_type_ids[i, j] = type_id
	timestamps_float[i, j] = event.get('timestamp', 0)
	relative_ts[i, j, 0] = event.get('relative_ts', 0.0)

	# Populate pointer indices
	w_addr = event.get('wallet_address')
	if w_addr:
	wallet_indices[i, j] = wallet_addr_to_batch_idx.get(w_addr, PAD_IDX_ENT)
	t_addr = event.get('token_address')
	if t_addr:
	token_indices[i, j] = token_addr_to_batch_idx.get(t_addr, PAD_IDX_ENT)

	# If it's a chart event, collect it and record its index
	if event_type == 'Chart_Segment':
	batch_chart_events.append(event)
	ohlc_indices[i, j] = chart_event_counter + 1 # Use 1-based index
	chart_event_counter += 1

	elif event_type in ['Transfer', 'LargeTransfer']: # ADDED LargeTransfer
	# Get destination wallet index
	dest_w_addr = event.get('destination_wallet_address') # Assuming this key exists
	if dest_w_addr:
	dest_wallet_indices[i, j] = wallet_addr_to_batch_idx.get(dest_w_addr, PAD_IDX_ENT)

	# Get numerical features (use .get with default 0)
	num_feats = [
	event.get('token_amount', 0.0),
	event.get('transfer_pct_of_total_supply', 0.0),
	event.get('transfer_pct_of_holding', 0.0),
	event.get('priority_fee', 0.0)
	]
	transfer_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type in ['Trade', 'LargeTrade']:
	# Get numerical and categorical features for the trade
	trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
	trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
	trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
	trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true

	num_feats = [
	event.get('sol_amount', 0.0),
	event.get('priority_fee', 0.0),
	event.get('token_amount_pct_of_holding', 0.0),
	event.get('quote_amount_pct_of_holding', 0.0),
	event.get('slippage', 0.0),
	event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
	1.0 if event.get('success') else 0.0,
	event.get('total_usd', 0.0)
	]
	trade_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'Deployer_Trade':
	# Use the dedicated tensor for deployer trades
	trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
	trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
	trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
	trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true
	num_feats = [
	event.get('sol_amount', 0.0),
	event.get('priority_fee', 0.0),
	event.get('token_amount_pct_of_holding', 0.0),
	event.get('quote_amount_pct_of_holding', 0.0),
	event.get('slippage', 0.0),
	event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
	1.0 if event.get('success') else 0.0,
	event.get('total_usd', 0.0)
	]
	deployer_trade_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'SmartWallet_Trade':
	# Use the dedicated tensor for smart wallet trades
	trade_dex_ids[i, j] = event.get('dex_platform_id', 0)
	trade_direction_ids[i, j] = event.get('trade_direction', 0) # 0=buy, 1=sell
	trade_mev_protection_ids[i, j] = event.get('mev_protection', 0) # 0, 1, 2...
	trade_is_bundle_ids[i, j] = 1 if event.get('is_bundle') else 0 # 0=false, 1=true
	num_feats = [
	event.get('sol_amount', 0.0),
	event.get('priority_fee', 0.0),
	event.get('token_amount_pct_of_holding', 0.0),
	event.get('quote_amount_pct_of_holding', 0.0),
	event.get('slippage', 0.0),
	event.get('token_amount_pct_to_total_supply', 0.0), # REPLACED price_impact
	1.0 if event.get('success') else 0.0,
	event.get('total_usd', 0.0)
	]
	smart_wallet_trade_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'PoolCreated':
	# Get the quote token index
	quote_t_addr = event.get('quote_token_address')
	if quote_t_addr:
	quote_token_indices[i, j] = token_addr_to_batch_idx.get(quote_t_addr, PAD_IDX_ENT)

	pool_created_protocol_ids[i, j] = event.get('protocol_id', 0)
	# Get numerical features
	num_feats = [
	event.get('base_amount', 0.0),
	event.get('quote_amount', 0.0)
	]
	pool_created_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'LiquidityChange':
	# Get the quote token index
	quote_t_addr = event.get('quote_token_address')
	if quote_t_addr:
	quote_token_indices[i, j] = token_addr_to_batch_idx.get(quote_t_addr, PAD_IDX_ENT)

	liquidity_change_type_ids[i, j] = event.get('change_type_id', 0)
	# Get numerical features
	num_feats = [event.get('quote_amount', 0.0)]
	liquidity_change_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'FeeCollected':
	# This event has the recipient wallet plus a single numerical feature (SOL amount).
	num_feats = [
	event.get('sol_amount', 0.0)
	]
	fee_collected_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'TokenBurn':
	# This event has a wallet (handled by wallet_indices) and two numerical features.
	num_feats = [
	event.get('amount_pct_of_total_supply', 0.0),
	event.get('amount_tokens_burned', 0.0)
	]
	token_burn_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'SupplyLock':
	# This event has a wallet and two numerical features.
	num_feats = [
	event.get('amount_pct_of_total_supply', 0.0),
	event.get('lock_duration', 0.0)
	]
	supply_lock_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'OnChain_Snapshot':
	# This event is a global snapshot with 14 numerical features.
	num_feats = [
	event.get('total_holders', 0.0),
	event.get('smart_traders', 0.0),
	event.get('kols', 0.0),
	event.get('holder_growth_rate', 0.0),
	event.get('top_10_holder_pct', 0.0),
	event.get('sniper_holding_pct', 0.0),
	event.get('rat_wallets_holding_pct', 0.0),
	event.get('bundle_holding_pct', 0.0),
	event.get('current_market_cap', 0.0),
	event.get('volume', 0.0),
	event.get('buy_count', 0.0),
	event.get('sell_count', 0.0),
	event.get('total_txns', 0.0),
	event.get('global_fees_paid', 0.0)
	]
	onchain_snapshot_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'TrendingToken':
	# Get the trending token index
	trending_t_addr = event.get('token_address')
	if trending_t_addr:
	trending_token_indices[i, j] = token_addr_to_batch_idx.get(trending_t_addr, PAD_IDX_ENT)

	trending_token_source_ids[i, j] = event.get('list_source_id', 0)
	trending_token_timeframe_ids[i, j] = event.get('timeframe_id', 0)
	# --- FIXED: Invert rank so that 1 is the highest value ---
	# Get numerical/categorical features
	num_feats = [
	1.0 / event.get('rank', 1e9) # Use a large number for rank 0 or missing to make it small
	]
	trending_token_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'BoostedToken':
	# Get the boosted token index
	boosted_t_addr = event.get('token_address')
	if boosted_t_addr:
	boosted_token_indices[i, j] = token_addr_to_batch_idx.get(boosted_t_addr, PAD_IDX_ENT)

	# --- FIXED: Invert rank so that 1 is the highest value ---
	# Get numerical features
	num_feats = [
	event.get('total_boost_amount', 0.0),
	1.0 / event.get('rank', 1e9)
	]
	boosted_token_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)

	elif event_type == 'Migrated':
	migrated_protocol_ids[i, j] = event.get('protocol_id', 0)

	elif event_type == 'HolderSnapshot':
	# --- FIXED: Store raw holder data, not an index ---
	raw_holders = event.get('holders', [])
	holder_snapshot_raw_data_list.append(raw_holders)
	holder_snapshot_indices[i, j] = len(holder_snapshot_raw_data_list) # 1-based index to the list

	elif event_type == 'Lighthouse_Snapshot':
	lighthousesnapshot_protocol_ids[i, j] = event.get('protocol_id', 0)
	lighthousesnapshot_timeframe_ids[i, j] = event.get('timeframe_id', 0)
	num_feats = [
	event.get('total_volume', 0.0),
	event.get('total_transactions', 0.0),
	event.get('total_traders', 0.0),
	event.get('total_tokens_created', 0.0),
	event.get('total_migrations', 0.0)
	]
	lighthousesnapshot_numerical_features[i, j, :] = torch.as_tensor(num_feats, dtype=self.dtype)


	# --- UPDATED: Group all events that contain pre-computed text/image indices ---
	elif event_type in ['XPost', 'XReply', 'XRetweet', 'XQuoteTweet', 'PumpReply', 'DexProfile_Updated', 'TikTok_Trending_Hashtag', 'XTrending_Hashtag']:
	# Store raw event data to look up text/image indices later
	# 1. Store raw text/media data
	textual_event_data_list.append(event)
	textual_event_indices[i, j] = len(textual_event_data_list) # 1-based index
	# --- FIXED: Handle rank for trending hashtags ---
	if event_type in ['TikTok_Trending_Hashtag', 'XTrending_Hashtag']:
	global_trending_numerical_features[i, j, 0] = 1.0 / event.get('rank', 1e9)

	# 2. Populate wallet pointer tensors based on the event type
	# The main 'wallet_address' is already handled above.
	# Here we handle the other wallets involved.
	if event_type == 'XRetweet' or event_type == 'XQuoteTweet':
	orig_author_addr = event.get('original_author_wallet_address')
	if orig_author_addr:
	# --- FIXED: Use the dedicated tensor for original authors ---
	original_author_indices[i, j] = wallet_addr_to_batch_idx.get(orig_author_addr, PAD_IDX_ENT)

	# The pre-computed embedding indices are already in the event dict.
	# No need to populate image_indices here anymore.
	# For XReply, the main tweet is a text/media embedding, not a wallet.
	# For XPost, there's only one wallet, already handled.

	# --- 4. Collate Dynamic Features (OHLC) AFTER collecting them ---
	ohlc_inputs_dict = self._collate_ohlc_inputs(batch_chart_events)

	# --- 6. Prepare final output dictionary ---
	collated_batch = {
	# Sequence Tensors
	'event_type_ids': event_type_ids,
	'timestamps_float': timestamps_float,
	'relative_ts': relative_ts,
	'attention_mask': attention_mask,
	# Pointer Tensors
	'wallet_indices': wallet_indices,
	'token_indices': token_indices,
	'quote_token_indices': quote_token_indices, # NEW
	'trending_token_indices': trending_token_indices, # NEW
	'boosted_token_indices': boosted_token_indices, # NEW
	'holder_snapshot_indices': holder_snapshot_indices, # This now points to the generated embeddings
	'textual_event_indices': textual_event_indices, # RENAMED
	'ohlc_indices': ohlc_indices,
	# Raw Data for Encoders
	'embedding_pool': batch_embedding_pool, # NEW
	'token_encoder_inputs': token_encoder_inputs,
	'wallet_encoder_inputs': wallet_encoder_inputs, # ADDED BACK
	'ohlc_price_tensors': ohlc_inputs_dict['price_tensor'],
	'ohlc_interval_ids': ohlc_inputs_dict['interval_ids'],
	'quant_ohlc_feature_tensors': ohlc_inputs_dict['quant_feature_tensors'],
	'quant_ohlc_feature_mask': ohlc_inputs_dict['quant_feature_mask'],
	'quant_ohlc_feature_version_ids': ohlc_inputs_dict['quant_feature_version_ids'],
	'graph_updater_links': graph_updater_links,
	'wallet_addr_to_batch_idx': wallet_addr_to_batch_idx, # NEW: Pass the mapping

	'dest_wallet_indices': dest_wallet_indices, # ADDED THIS LINE
	'original_author_indices': original_author_indices, # NEW
	# --- NEW: Numerical Features for Events ---
	'transfer_numerical_features': transfer_numerical_features,
	'trade_numerical_features': trade_numerical_features,
	'trade_dex_ids': trade_dex_ids,
	'deployer_trade_numerical_features': deployer_trade_numerical_features,
	'trade_direction_ids': trade_direction_ids, # NEW
	'trade_mev_protection_ids': trade_mev_protection_ids, # NEW
	'smart_wallet_trade_numerical_features': smart_wallet_trade_numerical_features,
	'trade_is_bundle_ids': trade_is_bundle_ids, # NEW
	'pool_created_numerical_features': pool_created_numerical_features,
	'pool_created_protocol_ids': pool_created_protocol_ids, # NEW
	'liquidity_change_numerical_features': liquidity_change_numerical_features,
	'liquidity_change_type_ids': liquidity_change_type_ids, # NEW
	'fee_collected_numerical_features': fee_collected_numerical_features, # NEW
	'token_burn_numerical_features': token_burn_numerical_features, # NEW
	'supply_lock_numerical_features': supply_lock_numerical_features, # NEW
	'onchain_snapshot_numerical_features': onchain_snapshot_numerical_features, # NEW
	'boosted_token_numerical_features': boosted_token_numerical_features,
	'trending_token_numerical_features': trending_token_numerical_features,
	'trending_token_source_ids': trending_token_source_ids, # NEW
	'trending_token_timeframe_ids': trending_token_timeframe_ids, # NEW
	'dexboost_paid_numerical_features': dexboost_paid_numerical_features, # NEW
	'dexprofile_updated_flags': dexprofile_updated_flags, # NEW,
	'global_trending_numerical_features': global_trending_numerical_features, # NEW
	'chainsnapshot_numerical_features': chainsnapshot_numerical_features, # NEW
	'lighthousesnapshot_numerical_features': lighthousesnapshot_numerical_features,
	'lighthousesnapshot_protocol_ids': lighthousesnapshot_protocol_ids, # NEW
	'lighthousesnapshot_timeframe_ids': lighthousesnapshot_timeframe_ids, # NEW
	'migrated_protocol_ids': migrated_protocol_ids, # NEW
	'alpha_group_ids': alpha_group_ids, # NEW
	'channel_ids': channel_ids, # NEW
	'exchange_ids': exchange_ids, # NEW
	'holder_snapshot_raw_data': holder_snapshot_raw_data_list, # NEW: Raw data for end-to-end processing
	'textual_event_data': textual_event_data_list, # RENAMED
	# Labels
	'labels': torch.stack([item['labels'] for item in batch]) if batch and 'labels' in batch[0] else None,
	'labels_mask': torch.stack([item['labels_mask'] for item in batch]) if batch and 'labels_mask' in batch[0] else None,
	'movement_class_targets': torch.stack([item['movement_class_targets'] for item in batch]) if batch and 'movement_class_targets' in batch[0] else None,
	'movement_class_mask': torch.stack([item['movement_class_mask'] for item in batch]) if batch and 'movement_class_mask' in batch[0] else None,
	'quality_score': torch.stack([item['quality_score'] if isinstance(item['quality_score'], torch.Tensor) else torch.tensor(item['quality_score'], dtype=torch.float32) for item in batch]) if batch and 'quality_score' in batch[0] else None,
	'class_id': torch.tensor([item.get('class_id', 0) for item in batch], dtype=torch.long),
	# Debug info
	'token_addresses': [item.get('token_address', 'unknown') for item in batch],
	't_cutoffs': [item.get('t_cutoff', 'unknown') for item in batch],
	'sample_indices': [item.get('sample_idx', -1) for item in batch]
	}

	if collated_batch['quality_score'] is None:
	raise RuntimeError("FATAL: Missing quality_score in batch items. Rebuild cache with quality_score enabled.")

	# Filter out None values (e.g., if no labels provided)
	return {k: v for k, v in collated_batch.items() if v is not None}