zirobtc commited on
Commit
18eb93c
·
1 Parent(s): d4195aa

Upload folder using huggingface_hub

Browse files
.ipynb_checkpoints/.gitignore-checkpoint ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore the __pycache__ directory anywhere in the repository
2
+ __pycache__/
3
+
4
+
5
+ # Ignore the 'runs' directory anywhere in the repository, regardless of nesting
6
+ runs/
7
+
8
+ data/pump_fun
9
+ data/cache
10
+ .env
11
+
12
+ data/cache
13
+ .tmp/
14
+ .cache/
15
+ checkpoints/
16
+ metadata/
17
+ store/
18
+ preprocessed_configs/
19
+ .early.coverage
data/.ipynb_checkpoints/data_fetcher-checkpoint.py ADDED
@@ -0,0 +1,1263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # data_fetcher.py
2
+
3
+ from typing import List, Dict, Any, Tuple, Set, Optional
4
+ from collections import defaultdict
5
+ import datetime, time
6
+
7
+ # We need the vocabulary for mapping IDs
8
+ import models.vocabulary as vocab
9
+
10
class DataFetcher:
    """Central access point for all ClickHouse and Neo4j reads.

    Keeping every database query in one class isolates data-fetching
    concerns from the dataset construction and model code.
    """

    # --- Explicit column definitions for wallet profile & social fetches ---

    # Slowly-changing identity fields stored in `wallet_profiles`.
    PROFILE_BASE_COLUMNS = [
        'wallet_address',
        'updated_at',
        'first_seen_ts',
        'last_seen_ts',
        'tags',
        'deployed_tokens',
        'funded_from',
        'funded_timestamp',
        'funded_signature',
        'funded_amount',
    ]

    # Activity metrics stored in `wallet_profile_metrics`. The rolling-window
    # stats repeat the same 15 fields for the 1d / 7d / 30d horizons, so they
    # are generated instead of written out 45 times.
    PROFILE_METRIC_COLUMNS = [
        'balance',
        'transfers_in_count',
        'transfers_out_count',
        'spl_transfers_in_count',
        'spl_transfers_out_count',
        'total_buys_count',
        'total_sells_count',
        'total_winrate',
    ] + [
        f'stats_{window}_{field}'
        for window in ('1d', '7d', '30d')
        for field in (
            'realized_profit_sol',
            'realized_profit_usd',
            'realized_profit_pnl',
            'buy_count',
            'sell_count',
            'transfer_in_count',
            'transfer_out_count',
            'avg_holding_period',
            'total_bought_cost_sol',
            'total_bought_cost_usd',
            'total_sold_income_sol',
            'total_sold_income_usd',
            'total_fee',
            'winrate',
            'tokens_traded',
        )
    ]

    # Full profile projection: identity fields followed by metric fields.
    PROFILE_COLUMNS_FOR_QUERY = PROFILE_BASE_COLUMNS + PROFILE_METRIC_COLUMNS

    # Columns read from the `wallet_socials` table.
    SOCIAL_COLUMNS_FOR_QUERY = [
        'wallet_address',
        'pumpfun_username',
        'twitter_username',
        'telegram_channel',
        'kolscan_name',
        'cabalspy_name',
        'axiom_kol_name',
    ]
97
+ def __init__(self, clickhouse_client: Any, neo4j_driver: Any):
98
+ self.db_client = clickhouse_client
99
+ self.graph_client = neo4j_driver
100
+ print("DataFetcher instantiated.")
101
+
102
+ def get_all_mints(self, start_date: Optional[datetime.datetime] = None) -> List[Dict[str, Any]]:
103
+ """
104
+ Fetches a list of all mint events to serve as dataset samples.
105
+ Can be filtered to only include mints on or after a given start_date.
106
+ """
107
+ query = "SELECT mint_address, timestamp, creator_address, protocol, token_name, token_symbol, token_uri, total_supply, token_decimals FROM mints"
108
+ params = {}
109
+ where_clauses = []
110
+
111
+ if start_date:
112
+ where_clauses.append("timestamp >= %(start_date)s")
113
+ params['start_date'] = start_date
114
+
115
+ if where_clauses:
116
+ query += " WHERE " + " AND ".join(where_clauses)
117
+
118
+ print(f"INFO: Executing query to get all mints: `{query}` with params: {params}")
119
+ try:
120
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
121
+ if not rows:
122
+ return []
123
+ columns = [col[0] for col in columns_info]
124
+ result = [dict(zip(columns, row)) for row in rows]
125
+ if not result:
126
+ return []
127
+ return result
128
+ except Exception as e:
129
+ print(f"ERROR: Failed to fetch token addresses from ClickHouse: {e}")
130
+ print("INFO: Falling back to mock token addresses for development.")
131
+ return [{'mint_address': 'tknA_real', 'timestamp': datetime.datetime.now(datetime.timezone.utc), 'creator_address': 'addr_Creator_Real', 'protocol': 0}]
132
+
133
+
134
+ def fetch_mint_record(self, token_address: str) -> Dict[str, Any]:
135
+ """
136
+ Fetches the raw mint record for a token from the 'mints' table.
137
+ """
138
+ query = f"SELECT timestamp, creator_address, mint_address, protocol FROM mints WHERE mint_address = '{token_address}' ORDER BY timestamp ASC LIMIT 1"
139
+ print(f"INFO: Executing query to fetch mint record: `{query}`")
140
+
141
+ # Assumes the client returns a list of dicts or can be converted
142
+ # Using column names from your schema
143
+ columns = ['timestamp', 'creator_address', 'mint_address', 'protocol']
144
+ try:
145
+ result = self.db_client.execute(query)
146
+
147
+ if not result or not result[0]:
148
+ raise ValueError(f"No mint event found for token {token_address}")
149
+
150
+ # Convert the tuple result into a dictionary
151
+ record = dict(zip(columns, result[0]))
152
+ return record
153
+ except Exception as e:
154
+ print(f"ERROR: Failed to fetch mint record for {token_address}: {e}")
155
+ print("INFO: Falling back to mock mint record for development.")
156
+ # Fallback for development if DB connection fails
157
+ return {
158
+ 'timestamp': datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=1),
159
+ 'creator_address': 'addr_Creator_Real',
160
+ 'mint_address': token_address,
161
+ 'protocol': vocab.PROTOCOL_TO_ID.get("Pump V1", 0)
162
+ }
163
+
164
+ def fetch_wallet_profiles(self, wallet_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
165
+ """
166
+ Convenience wrapper around fetch_wallet_profiles_and_socials for profile-only data.
167
+ """
168
+ profiles, _ = self.fetch_wallet_profiles_and_socials(wallet_addresses, T_cutoff)
169
+ return profiles
170
+
171
+ def fetch_wallet_socials(self, wallet_addresses: List[str]) -> Dict[str, Dict[str, Any]]:
172
+ """
173
+ Fetches wallet social records for a list of wallet addresses.
174
+ Batches queries to avoid "Max query size exceeded" errors.
175
+ Returns a dictionary mapping wallet_address to its social data.
176
+ """
177
+ if not wallet_addresses:
178
+ return {}
179
+
180
+ BATCH_SIZE = 1000
181
+ socials = {}
182
+ total_wallets = len(wallet_addresses)
183
+ print(f"INFO: Executing query to fetch wallet socials for {total_wallets} wallets in batches of {BATCH_SIZE}.")
184
+
185
+ for i in range(0, total_wallets, BATCH_SIZE):
186
+ batch_addresses = wallet_addresses[i : i + BATCH_SIZE]
187
+
188
+ query = "SELECT * FROM wallet_socials WHERE wallet_address IN %(addresses)s"
189
+ params = {'addresses': batch_addresses}
190
+
191
+ try:
192
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
193
+ if not rows:
194
+ continue
195
+
196
+ columns = [col[0] for col in columns_info]
197
+ for row in rows:
198
+ social_dict = dict(zip(columns, row))
199
+ wallet_addr = social_dict.get('wallet_address')
200
+ if wallet_addr:
201
+ socials[wallet_addr] = social_dict
202
+
203
+ except Exception as e:
204
+ print(f"ERROR: Failed to fetch wallet socials for batch {i}: {e}")
205
+ # Continue to next batch
206
+
207
+ return socials
208
+
209
+ def fetch_wallet_profiles_and_socials(self,
210
+ wallet_addresses: List[str],
211
+ T_cutoff: datetime.datetime) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Dict[str, Any]]]:
212
+ """
213
+ Fetches wallet profiles (time-aware) and socials for all requested wallets.
214
+ Batches queries to avoid "Max query size exceeded" errors.
215
+ Returns two dictionaries: profiles, socials.
216
+ """
217
+ if not wallet_addresses:
218
+ return {}, {}
219
+
220
+ social_columns = self.SOCIAL_COLUMNS_FOR_QUERY
221
+ profile_base_cols = self.PROFILE_BASE_COLUMNS
222
+ profile_metric_cols = self.PROFILE_METRIC_COLUMNS
223
+
224
+ profile_base_str = ",\n ".join(profile_base_cols)
225
+ metric_projection_cols = ['wallet_address', 'updated_at'] + profile_metric_cols
226
+ profile_metric_str = ",\n ".join(metric_projection_cols)
227
+
228
+ profile_base_select_cols = [col for col in profile_base_cols if col != 'wallet_address']
229
+ profile_metric_select_cols = [
230
+ col for col in profile_metric_cols if col not in ('wallet_address',)
231
+ ]
232
+ social_select_cols = [col for col in social_columns if col != 'wallet_address']
233
+
234
+ select_expressions = []
235
+ for col in profile_base_select_cols:
236
+ select_expressions.append(f"lp.{col} AS profile__{col}")
237
+ for col in profile_metric_select_cols:
238
+ select_expressions.append(f"lm.{col} AS profile__{col}")
239
+ for col in social_select_cols:
240
+ select_expressions.append(f"ws.{col} AS social__{col}")
241
+ select_clause = ""
242
+ if select_expressions:
243
+ select_clause = ",\n " + ",\n ".join(select_expressions)
244
+
245
+ profile_keys = [f"profile__{col}" for col in (profile_base_select_cols + profile_metric_select_cols)]
246
+ social_keys = [f"social__{col}" for col in social_select_cols]
247
+
248
+ BATCH_SIZE = 1000
249
+ all_profiles = {}
250
+ all_socials = {}
251
+
252
+ total_wallets = len(wallet_addresses)
253
+ print(f"INFO: Fetching profiles+socials for {total_wallets} wallets in batches of {BATCH_SIZE}...")
254
+
255
+ for i in range(0, total_wallets, BATCH_SIZE):
256
+ batch_addresses = wallet_addresses[i : i + BATCH_SIZE]
257
+
258
+ query = f"""
259
+ WITH ranked_profiles AS (
260
+ SELECT
261
+ {profile_base_str},
262
+ ROW_NUMBER() OVER (PARTITION BY wallet_address ORDER BY updated_at DESC) AS rn
263
+ FROM wallet_profiles
264
+ WHERE wallet_address IN %(addresses)s
265
+ ),
266
+ latest_profiles AS (
267
+ SELECT
268
+ {profile_base_str}
269
+ FROM ranked_profiles
270
+ WHERE rn = 1
271
+ ),
272
+ ranked_metrics AS (
273
+ SELECT
274
+ {profile_metric_str},
275
+ ROW_NUMBER() OVER (PARTITION BY wallet_address ORDER BY updated_at DESC) AS rn
276
+ FROM wallet_profile_metrics
277
+ WHERE
278
+ wallet_address IN %(addresses)s
279
+ AND updated_at <= %(T_cutoff)s
280
+ ),
281
+ latest_metrics AS (
282
+ SELECT
283
+ {profile_metric_str}
284
+ FROM ranked_metrics
285
+ WHERE rn = 1
286
+ ),
287
+ requested_wallets AS (
288
+ SELECT DISTINCT wallet_address
289
+ FROM (SELECT arrayJoin(%(addresses)s) AS wallet_address)
290
+ )
291
+ SELECT
292
+ rw.wallet_address AS wallet_address
293
+ {select_clause}
294
+ FROM requested_wallets AS rw
295
+ LEFT JOIN latest_profiles AS lp ON rw.wallet_address = lp.wallet_address
296
+ LEFT JOIN latest_metrics AS lm ON rw.wallet_address = lm.wallet_address
297
+ LEFT JOIN wallet_socials AS ws ON rw.wallet_address = ws.wallet_address;
298
+ """
299
+
300
+ params = {'addresses': batch_addresses, 'T_cutoff': T_cutoff}
301
+
302
+ try:
303
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
304
+ if not rows:
305
+ continue
306
+
307
+ columns = [col[0] for col in columns_info]
308
+
309
+ for row in rows:
310
+ row_dict = dict(zip(columns, row))
311
+ wallet_addr = row_dict.get('wallet_address')
312
+ if not wallet_addr:
313
+ continue
314
+
315
+ profile_data = {}
316
+ if profile_keys:
317
+ for pref_key in profile_keys:
318
+ if pref_key in row_dict:
319
+ value = row_dict[pref_key]
320
+ profile_data[pref_key.replace('profile__', '')] = value
321
+
322
+ if profile_data and any(value is not None for value in profile_data.values()):
323
+ profile_data['wallet_address'] = wallet_addr
324
+ all_profiles[wallet_addr] = profile_data
325
+
326
+ social_data = {}
327
+ if social_keys:
328
+ for pref_key in social_keys:
329
+ if pref_key in row_dict:
330
+ value = row_dict[pref_key]
331
+ social_data[pref_key.replace('social__', '')] = value
332
+
333
+ if social_data and any(value is not None for value in social_data.values()):
334
+ social_data['wallet_address'] = wallet_addr
335
+ all_socials[wallet_addr] = social_data
336
+
337
+ except Exception as e:
338
+ print(f"ERROR: Combined profile/social query failed for batch {i}-{i+BATCH_SIZE}: {e}")
339
+ # We continue to the next batch
340
+
341
+ return all_profiles, all_socials
342
+
343
+ def fetch_wallet_holdings(self, wallet_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, List[Dict[str, Any]]]:
344
+ """
345
+ Fetches top 2 wallet holding records for a list of wallet addresses that were active at T_cutoff.
346
+ Batches queries to avoid "Max query size exceeded" errors.
347
+ Returns a dictionary mapping wallet_address to a LIST of its holding data.
348
+ """
349
+ if not wallet_addresses:
350
+ return {}
351
+
352
+ BATCH_SIZE = 1000
353
+ holdings = defaultdict(list)
354
+ total_wallets = len(wallet_addresses)
355
+ print(f"INFO: Executing query to fetch wallet holdings for {total_wallets} wallets in batches of {BATCH_SIZE}.")
356
+
357
+ for i in range(0, total_wallets, BATCH_SIZE):
358
+ batch_addresses = wallet_addresses[i : i + BATCH_SIZE]
359
+
360
+ # --- Time-aware query ---
361
+ # 1. For each holding, find the latest state at or before T_cutoff.
362
+ # 2. Filter for holdings where the balance was greater than 0.
363
+ # 3. Rank these active holdings by USD volume and take the top 2 per wallet.
364
+ query = """
365
+ WITH point_in_time_holdings AS (
366
+ SELECT
367
+ *,
368
+ COALESCE(history_bought_cost_sol, 0) + COALESCE(history_sold_income_sol, 0) AS total_volume_usd,
369
+ ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
370
+ FROM wallet_holdings
371
+ WHERE
372
+ wallet_address IN %(addresses)s
373
+ AND updated_at <= %(T_cutoff)s
374
+ ),
375
+ ranked_active_holdings AS (
376
+ SELECT *,
377
+ ROW_NUMBER() OVER(PARTITION BY wallet_address ORDER BY total_volume_usd DESC) as rn_per_wallet
378
+ FROM point_in_time_holdings
379
+ WHERE rn_per_holding = 1 AND current_balance > 0
380
+ )
381
+ SELECT *
382
+ FROM ranked_active_holdings
383
+ WHERE rn_per_wallet <= 2;
384
+ """
385
+ params = {'addresses': batch_addresses, 'T_cutoff': T_cutoff}
386
+
387
+ try:
388
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
389
+ if not rows:
390
+ continue
391
+
392
+ columns = [col[0] for col in columns_info]
393
+ for row in rows:
394
+ holding_dict = dict(zip(columns, row))
395
+ wallet_addr = holding_dict.get('wallet_address')
396
+ if wallet_addr:
397
+ holdings[wallet_addr].append(holding_dict)
398
+
399
+ except Exception as e:
400
+ print(f"ERROR: Failed to fetch wallet holdings for batch {i}: {e}")
401
+ # Continue to next batch
402
+
403
+ return dict(holdings)
404
+
405
+ def fetch_graph_links(self,
406
+ initial_addresses: List[str],
407
+ T_cutoff: datetime.datetime,
408
+ max_degrees: int = 1) -> Tuple[Dict[str, str], Dict[str, Dict[str, Any]]]:
409
+ """
410
+ Fetches graph links from Neo4j, traversing up to a max degree of separation.
411
+
412
+ Args:
413
+ initial_addresses: A list of starting wallet or token addresses.
414
+ max_degrees: The maximum number of hops to traverse in the graph.
415
+
416
+ Returns:
417
+ A tuple containing:
418
+ - A dictionary mapping entity addresses to their type ('Wallet' or 'Token').
419
+ - A dictionary of aggregated links, structured for the GraphUpdater.
420
+ """
421
+ if not initial_addresses:
422
+ return {}, {}
423
+
424
+ cutoff_ts = int(T_cutoff.timestamp())
425
+
426
+ print(f"INFO: Fetching graph links up to {max_degrees} degrees for {len(initial_addresses)} initial entities...")
427
+
428
+ max_retries = 3
429
+ backoff_sec = 2
430
+
431
+ for attempt in range(max_retries + 1):
432
+ try:
433
+ with self.graph_client.session() as session:
434
+ all_entities = {addr: 'Token' for addr in initial_addresses} # Assume initial are tokens
435
+ newly_found_entities = set(initial_addresses)
436
+ aggregated_links = defaultdict(lambda: {'links': [], 'edges': []})
437
+
438
+ for i in range(max_degrees):
439
+ if not newly_found_entities:
440
+ break
441
+
442
+ print(f" - Degree {i+1}: Traversing from {len(newly_found_entities)} new entities...")
443
+
444
+ # --- TIMING: Query execution ---
445
+ _t_query_start = time.perf_counter()
446
+
447
+ # Cypher query to find direct neighbors of the current frontier
448
+ # OPTIMIZED: Filter by timestamp IN Neo4j to avoid transferring 97%+ unused records
449
+ query = """
450
+ MATCH (a)-[r]-(b)
451
+ WHERE a.address IN $addresses AND r.timestamp <= $cutoff_ts
452
+ RETURN a.address AS source_address, type(r) AS link_type, properties(r) AS link_props, b.address AS dest_address, labels(b)[0] AS dest_type
453
+ LIMIT 10000
454
+ """
455
+ params = {'addresses': list(newly_found_entities), 'cutoff_ts': cutoff_ts}
456
+ result = session.run(query, params)
457
+
458
+ _t_query_done = time.perf_counter()
459
+
460
+ # --- TIMING: Result processing ---
461
+ _t_process_start = time.perf_counter()
462
+ records_total = 0
463
+
464
+ current_degree_new_entities = set()
465
+ for record in result:
466
+ records_total += 1
467
+ link_type = record['link_type']
468
+ link_props = dict(record['link_props'])
469
+ source_addr = record['source_address']
470
+ dest_addr = record['dest_address']
471
+ dest_type = record['dest_type']
472
+
473
+ # Add the link and edge data
474
+ aggregated_links[link_type]['links'].append(link_props)
475
+ aggregated_links[link_type]['edges'].append((source_addr, dest_addr))
476
+
477
+ # If we found a new entity, add it to the set for the next iteration
478
+ if dest_addr not in all_entities.keys():
479
+ current_degree_new_entities.add(dest_addr)
480
+ all_entities[dest_addr] = dest_type
481
+
482
+ _t_process_done = time.perf_counter()
483
+
484
+ # --- TIMING: Print detailed stats ---
485
+ print(f" [NEO4J TIMING] query_exec: {(_t_query_done - _t_query_start)*1000:.1f}ms, "
486
+ f"result_process: {(_t_process_done - _t_process_start)*1000:.1f}ms")
487
+ print(f" [NEO4J STATS] records_returned: {records_total}, "
488
+ f"new_entities: {len(current_degree_new_entities)}")
489
+
490
+ newly_found_entities = current_degree_new_entities
491
+
492
+ # --- Post-process: rename, map props, strip, cap ---
493
+ MAX_LINKS_PER_TYPE = 500
494
+
495
+ # Neo4j type -> collator type name
496
+ _NEO4J_TO_COLLATOR_NAME = {
497
+ 'TRANSFERRED_TO': 'TransferLink',
498
+ 'BUNDLE_TRADE': 'BundleTradeLink',
499
+ 'COPIED_TRADE': 'CopiedTradeLink',
500
+ 'COORDINATED_ACTIVITY': 'CoordinatedActivityLink',
501
+ 'SNIPED': 'SnipedLink',
502
+ 'MINTED': 'MintedLink',
503
+ 'LOCKED_SUPPLY': 'LockedSupplyLink',
504
+ 'BURNED': 'BurnedLink',
505
+ 'PROVIDED_LIQUIDITY': 'ProvidedLiquidityLink',
506
+ 'WHALE_OF': 'WhaleOfLink',
507
+ 'TOP_TRADER_OF': 'TopTraderOfLink',
508
+ }
509
+
510
+ # Neo4j prop name -> encoder prop name (for fields with mismatched names)
511
+ _PROP_REMAP = {
512
+ 'CopiedTradeLink': {
513
+ 'buy_gap': 'time_gap_on_buy_sec',
514
+ 'sell_gap': 'time_gap_on_sell_sec',
515
+ 'f_buy_total': 'follower_buy_total',
516
+ 'f_sell_total': 'follower_sell_total',
517
+ 'leader_pnl': 'leader_pnl',
518
+ 'follower_pnl': 'follower_pnl',
519
+ },
520
+ }
521
+
522
+ # Only keep fields each encoder actually reads
523
+ _NEEDED_FIELDS = {
524
+ 'TransferLink': ['amount', 'mint'],
525
+ 'BundleTradeLink': ['signatures'], # Neo4j has no total_amount; we derive it below
526
+ 'CopiedTradeLink': ['time_gap_on_buy_sec', 'time_gap_on_sell_sec', 'leader_pnl', 'follower_pnl', 'follower_buy_total', 'follower_sell_total'],
527
+ 'CoordinatedActivityLink': ['time_gap_on_first_sec', 'time_gap_on_second_sec'],
528
+ 'SnipedLink': ['rank', 'sniped_amount'],
529
+ 'MintedLink': ['buy_amount'],
530
+ 'LockedSupplyLink': ['amount'],
531
+ 'BurnedLink': ['amount'],
532
+ 'ProvidedLiquidityLink': ['amount_quote'],
533
+ 'WhaleOfLink': ['holding_pct_at_creation'],
534
+ 'TopTraderOfLink': ['pnl_at_creation'],
535
+ }
536
+
537
+ cleaned_links = {}
538
+ for neo4j_type, data in aggregated_links.items():
539
+ collator_name = _NEO4J_TO_COLLATOR_NAME.get(neo4j_type)
540
+ if not collator_name:
541
+ continue # Skip unknown link types
542
+
543
+ links = data['links']
544
+ edges = data['edges']
545
+
546
+ # Cap
547
+ links = links[:MAX_LINKS_PER_TYPE]
548
+ edges = edges[:MAX_LINKS_PER_TYPE]
549
+
550
+ # Remap property names if needed
551
+ remap = _PROP_REMAP.get(collator_name)
552
+ if remap:
553
+ links = [{remap.get(k, k): v for k, v in l.items()} for l in links]
554
+
555
+ # Strip to only needed fields
556
+ needed = _NEEDED_FIELDS.get(collator_name, [])
557
+ links = [{f: l.get(f, 0) for f in needed} for l in links]
558
+
559
+ # BundleTradeLink: Neo4j has no total_amount; derive from signatures count
560
+ if collator_name == 'BundleTradeLink':
561
+ links = [{'total_amount': len(l.get('signatures', []) if isinstance(l.get('signatures'), list) else [])} for l in links]
562
+
563
+ cleaned_links[collator_name] = {'links': links, 'edges': edges}
564
+
565
+ return all_entities, cleaned_links
566
+
567
+ except Exception as e:
568
+ msg = str(e)
569
+ is_rate_limit = "AuthenticationRateLimit" in msg or "RateLimit" in msg
570
+ is_transient = "ServiceUnavailable" in msg or "TransientError" in msg or "SessionExpired" in msg
571
+
572
+ if is_rate_limit or is_transient:
573
+ if attempt < max_retries:
574
+ sleep_time = backoff_sec * (2 ** attempt)
575
+ print(f"WARN: Neo4j error ({type(e).__name__}). Retrying in {sleep_time}s... (Attempt {attempt+1}/{max_retries})")
576
+ time.sleep(sleep_time)
577
+ continue
578
+
579
+ # If we're here, it's either not retryable or we ran out of retries
580
+ # Ensure we use "FATAL" prefix so the caller knows to stop if required
581
+ raise RuntimeError(f"FATAL: Failed to fetch graph links from Neo4j: {e}") from e
582
+
583
+ def fetch_token_data(self, token_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
584
+ """
585
+ Fetches the latest token data for each address at or before T_cutoff.
586
+ Batches queries to avoid "Max query size exceeded" errors.
587
+ Returns a dictionary mapping token_address to its data.
588
+ """
589
+ if not token_addresses:
590
+ return {}
591
+
592
+ BATCH_SIZE = 1000
593
+ tokens = {}
594
+ total_tokens = len(token_addresses)
595
+ print(f"INFO: Executing query to fetch token data for {total_tokens} tokens in batches of {BATCH_SIZE}.")
596
+
597
+ for i in range(0, total_tokens, BATCH_SIZE):
598
+ batch_addresses = token_addresses[i : i + BATCH_SIZE]
599
+
600
+ # --- NEW: Time-aware query for historical token data ---
601
+ query = """
602
+ WITH ranked_tokens AS (
603
+ SELECT
604
+ *,
605
+ ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
606
+ FROM tokens
607
+ WHERE
608
+ token_address IN %(addresses)s
609
+ AND updated_at <= %(T_cutoff)s
610
+ )
611
+ SELECT token_address, name, symbol, token_uri, protocol, total_supply, decimals
612
+ FROM ranked_tokens
613
+ WHERE rn = 1;
614
+ """
615
+ params = {'addresses': batch_addresses, 'T_cutoff': T_cutoff}
616
+
617
+ try:
618
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
619
+ if not rows:
620
+ continue
621
+
622
+ # Get column names from the query result description
623
+ columns = [col[0] for col in columns_info]
624
+
625
+ for row in rows:
626
+ token_dict = dict(zip(columns, row))
627
+ token_addr = token_dict.get('token_address')
628
+ if token_addr:
629
+ # The 'tokens' table in the schema has 'token_address' but the
630
+ # collator expects 'address'. We'll add it for compatibility.
631
+ token_dict['address'] = token_addr
632
+ tokens[token_addr] = token_dict
633
+
634
+ except Exception as e:
635
+ print(f"ERROR: Failed to fetch token data for batch {i}: {e}")
636
+ # Continue next batch
637
+
638
+ return tokens
639
+
640
+ def fetch_deployed_token_details(self, token_addresses: List[str], T_cutoff: datetime.datetime) -> Dict[str, Dict[str, Any]]:
641
+ """
642
+ Fetches historical details for deployed tokens at or before T_cutoff.
643
+ Batches queries to avoid "Max query size exceeded" errors.
644
+ """
645
+ if not token_addresses:
646
+ return {}
647
+
648
+ BATCH_SIZE = 1000
649
+ token_details = {}
650
+ total_tokens = len(token_addresses)
651
+ print(f"INFO: Executing query to fetch deployed token details for {total_tokens} tokens in batches of {BATCH_SIZE}.")
652
+
653
+ for i in range(0, total_tokens, BATCH_SIZE):
654
+ batch_addresses = token_addresses[i : i + BATCH_SIZE]
655
+
656
+ # --- NEW: Time-aware query for historical deployed token details ---
657
+ query = """
658
+ WITH ranked_tokens AS (
659
+ SELECT
660
+ *,
661
+ ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
662
+ FROM tokens
663
+ WHERE
664
+ token_address IN %(addresses)s
665
+ AND updated_at <= %(T_cutoff)s
666
+ ),
667
+ ranked_token_metrics AS (
668
+ SELECT
669
+ token_address,
670
+ ath_price_usd,
671
+ ROW_NUMBER() OVER (PARTITION BY token_address ORDER BY updated_at DESC) as rn
672
+ FROM token_metrics
673
+ WHERE
674
+ token_address IN %(addresses)s
675
+ AND updated_at <= %(T_cutoff)s
676
+ ),
677
+ latest_tokens AS (
678
+ SELECT *
679
+ FROM ranked_tokens
680
+ WHERE rn = 1
681
+ ),
682
+ latest_token_metrics AS (
683
+ SELECT *
684
+ FROM ranked_token_metrics
685
+ WHERE rn = 1
686
+ )
687
+ SELECT
688
+ lt.token_address,
689
+ lt.created_at,
690
+ lt.updated_at,
691
+ ltm.ath_price_usd,
692
+ lt.total_supply,
693
+ lt.decimals,
694
+ (lt.launchpad != lt.protocol) AS has_migrated
695
+ FROM latest_tokens AS lt
696
+ LEFT JOIN latest_token_metrics AS ltm
697
+ ON lt.token_address = ltm.token_address;
698
+ """
699
+ params = {'addresses': batch_addresses, 'T_cutoff': T_cutoff}
700
+
701
+ try:
702
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
703
+ if not rows:
704
+ continue
705
+
706
+ columns = [col[0] for col in columns_info]
707
+ for row in rows:
708
+ token_details[row[0]] = dict(zip(columns, row))
709
+ except Exception as e:
710
+ print(f"ERROR: Failed to fetch deployed token details for batch {i}: {e}")
711
+ # Continue next batch
712
+
713
+ return token_details
714
+
715
+ def fetch_trades_for_token(self, token_address: str, T_cutoff: datetime.datetime, count_threshold: int, early_limit: int, recent_limit: int, full_history: bool = False) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
716
+ """
717
+ Fetches ALL trades for a token up to T_cutoff, ordered by time.
718
+
719
+ Notes:
720
+ - This intentionally does NOT apply the older fetch-time H/B/H (High-Def / Blurry / High-Def)
721
+ sampling logic. Sequence-length control is handled later in data_loader.py via event-level
722
+ head/tail sampling with MIDDLE/RECENT markers.
723
+ - The function signature still includes legacy H/B/H parameters for compatibility.
724
+ Returns: (all_trades, [], [])
725
+ """
726
+ if not token_address:
727
+ return [], [], []
728
+
729
+ params = {'token_address': token_address, 'T_cutoff': T_cutoff}
730
+ query = "SELECT * FROM trades WHERE base_address = %(token_address)s AND timestamp <= %(T_cutoff)s ORDER BY timestamp ASC"
731
+ try:
732
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
733
+ if not rows:
734
+ return [], [], []
735
+ columns = [col[0] for col in columns_info]
736
+ all_trades = [dict(zip(columns, row)) for row in rows]
737
+ return all_trades, [], []
738
+ except Exception as e:
739
+ print(f"ERROR: Failed to fetch trades for token {token_address}: {e}")
740
+ return [], [], []
741
+
742
+ def fetch_future_trades_for_token(self,
743
+ token_address: str,
744
+ start_ts: datetime.datetime,
745
+ end_ts: datetime.datetime) -> List[Dict[str, Any]]:
746
+ """
747
+ Fetches successful trades for a token in the window (start_ts, end_ts].
748
+ Used for constructing label targets beyond the cutoff.
749
+ """
750
+ if not token_address or start_ts is None or end_ts is None or start_ts >= end_ts:
751
+ return []
752
+
753
+ query = """
754
+ SELECT *
755
+ FROM trades
756
+ WHERE base_address = %(token_address)s
757
+ AND success = true
758
+ AND timestamp > %(start_ts)s
759
+ AND timestamp <= %(end_ts)s
760
+ ORDER BY timestamp ASC
761
+ """
762
+ params = {
763
+ 'token_address': token_address,
764
+ 'start_ts': start_ts,
765
+ 'end_ts': end_ts
766
+ }
767
+
768
+ try:
769
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
770
+ if not rows:
771
+ return []
772
+ columns = [col[0] for col in columns_info]
773
+ return [dict(zip(columns, row)) for row in rows]
774
+ except Exception as e:
775
+ print(f"ERROR: Failed to fetch future trades for token {token_address}: {e}")
776
+ return []
777
+
778
+ def fetch_transfers_for_token(self, token_address: str, T_cutoff: datetime.datetime, min_amount_threshold: float = 10_000_000) -> List[Dict[str, Any]]:
779
+ """
780
+ Fetches all transfers for a token before T_cutoff, filtering out small amounts.
781
+ """
782
+ if not token_address:
783
+ return []
784
+
785
+ query = """
786
+ SELECT * FROM transfers
787
+ WHERE mint_address = %(token_address)s
788
+ AND timestamp <= %(T_cutoff)s
789
+ AND amount_decimal >= %(min_amount)s
790
+ ORDER BY timestamp ASC
791
+ """
792
+ params = {'token_address': token_address, 'T_cutoff': T_cutoff, 'min_amount': min_amount_threshold}
793
+ print(f"INFO: Fetching significant transfers for {token_address} (amount >= {min_amount_threshold}).")
794
+
795
+ try:
796
+ # This query no longer uses H/B/H, it fetches all significant transfers
797
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
798
+ if not rows: return []
799
+ columns = [col[0] for col in columns_info]
800
+ return [dict(zip(columns, row)) for row in rows]
801
+ except Exception as e:
802
+ print(f"ERROR: Failed to fetch transfers for token {token_address}: {e}")
803
+ return []
804
+
805
+ def fetch_pool_creations_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
806
+ """
807
+ Fetches pool creation records where the token is the base asset.
808
+ """
809
+ if not token_address:
810
+ return []
811
+
812
+ query = """
813
+ SELECT
814
+ signature,
815
+ timestamp,
816
+ slot,
817
+ success,
818
+ error,
819
+ priority_fee,
820
+ protocol,
821
+ creator_address,
822
+ pool_address,
823
+ base_address,
824
+ quote_address,
825
+ lp_token_address,
826
+ initial_base_liquidity,
827
+ initial_quote_liquidity,
828
+ base_decimals,
829
+ quote_decimals
830
+ FROM pool_creations
831
+ WHERE base_address = %(token_address)s
832
+ AND timestamp <= %(T_cutoff)s
833
+ ORDER BY timestamp ASC
834
+ """
835
+ params = {'token_address': token_address, 'T_cutoff': T_cutoff}
836
+ # print(f"INFO: Fetching pool creation events for {token_address}.")
837
+
838
+ try:
839
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
840
+ if not rows:
841
+ return []
842
+ columns = [col[0] for col in columns_info]
843
+ return [dict(zip(columns, row)) for row in rows]
844
+ except Exception as e:
845
+ print(f"ERROR: Failed to fetch pool creations for token {token_address}: {e}")
846
+ return []
847
+
848
+ def fetch_liquidity_changes_for_pools(self, pool_addresses: List[str], T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
849
+ """
850
+ Fetches liquidity change records for the given pools up to T_cutoff.
851
+ """
852
+ if not pool_addresses:
853
+ return []
854
+
855
+ query = """
856
+ SELECT
857
+ signature,
858
+ timestamp,
859
+ slot,
860
+ success,
861
+ error,
862
+ priority_fee,
863
+ protocol,
864
+ change_type,
865
+ lp_provider,
866
+ pool_address,
867
+ base_amount,
868
+ quote_amount
869
+ FROM liquidity
870
+ WHERE pool_address IN %(pool_addresses)s
871
+ AND timestamp <= %(T_cutoff)s
872
+ ORDER BY timestamp ASC
873
+ """
874
+ params = {'pool_addresses': pool_addresses, 'T_cutoff': T_cutoff}
875
+ # print(f"INFO: Fetching liquidity change events for {len(pool_addresses)} pool(s).")
876
+
877
+ try:
878
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
879
+ if not rows:
880
+ return []
881
+ columns = [col[0] for col in columns_info]
882
+ return [dict(zip(columns, row)) for row in rows]
883
+ except Exception as e:
884
+ print(f"ERROR: Failed to fetch liquidity changes for pools {pool_addresses}: {e}")
885
+ return []
886
+
887
+ def fetch_fee_collections_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
888
+ """
889
+ Fetches fee collection events where the token appears as either token_0 or token_1.
890
+ """
891
+ if not token_address:
892
+ return []
893
+
894
+ query = """
895
+ SELECT
896
+ timestamp,
897
+ signature,
898
+ slot,
899
+ success,
900
+ error,
901
+ priority_fee,
902
+ protocol,
903
+ recipient_address,
904
+ token_0_mint_address,
905
+ token_0_amount,
906
+ token_1_mint_address,
907
+ token_1_amount
908
+ FROM fee_collections
909
+ WHERE (token_0_mint_address = %(token)s OR token_1_mint_address = %(token)s)
910
+ AND timestamp <= %(T_cutoff)s
911
+ ORDER BY timestamp ASC
912
+ """
913
+ params = {'token': token_address, 'T_cutoff': T_cutoff}
914
+ # print(f"INFO: Fetching fee collection events for {token_address}.")
915
+
916
+ try:
917
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
918
+ if not rows:
919
+ return []
920
+ columns = [col[0] for col in columns_info]
921
+ return [dict(zip(columns, row)) for row in rows]
922
+ except Exception as e:
923
+ print(f"ERROR: Failed to fetch fee collections for token {token_address}: {e}")
924
+ return []
925
+
926
+ def fetch_migrations_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
927
+ """
928
+ Fetches migration records for a given token up to T_cutoff.
929
+ """
930
+ if not token_address:
931
+ return []
932
+ query = """
933
+ SELECT
934
+ timestamp,
935
+ signature,
936
+ slot,
937
+ success,
938
+ error,
939
+ priority_fee,
940
+ protocol,
941
+ mint_address,
942
+ virtual_pool_address,
943
+ pool_address,
944
+ migrated_base_liquidity,
945
+ migrated_quote_liquidity
946
+ FROM migrations
947
+ WHERE mint_address = %(token)s
948
+ AND timestamp <= %(T_cutoff)s
949
+ ORDER BY timestamp ASC
950
+ """
951
+ params = {'token': token_address, 'T_cutoff': T_cutoff}
952
+ # print(f"INFO: Fetching migrations for {token_address}.")
953
+ try:
954
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
955
+ if not rows:
956
+ return []
957
+ columns = [col[0] for col in columns_info]
958
+ return [dict(zip(columns, row)) for row in rows]
959
+ except Exception as e:
960
+ print(f"ERROR: Failed to fetch migrations for token {token_address}: {e}")
961
+ return []
962
+
963
+ def fetch_burns_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
964
+ """
965
+ Fetches burn events for a given token up to T_cutoff.
966
+ Schema: burns(timestamp, signature, slot, success, error, priority_fee, mint_address, source, amount, amount_decimal, source_balance)
967
+ """
968
+ if not token_address:
969
+ return []
970
+
971
+ query = """
972
+ SELECT
973
+ timestamp,
974
+ signature,
975
+ slot,
976
+ success,
977
+ error,
978
+ priority_fee,
979
+ mint_address,
980
+ source,
981
+ amount,
982
+ amount_decimal,
983
+ source_balance
984
+ FROM burns
985
+ WHERE mint_address = %(token)s
986
+ AND timestamp <= %(T_cutoff)s
987
+ ORDER BY timestamp ASC
988
+ """
989
+ params = {'token': token_address, 'T_cutoff': T_cutoff}
990
+ # print(f"INFO: Fetching burn events for {token_address}.")
991
+
992
+ try:
993
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
994
+ if not rows:
995
+ return []
996
+ columns = [col[0] for col in columns_info]
997
+ return [dict(zip(columns, row)) for row in rows]
998
+ except Exception as e:
999
+ print(f"ERROR: Failed to fetch burns for token {token_address}: {e}")
1000
+ return []
1001
+
1002
+ def fetch_supply_locks_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> List[Dict[str, Any]]:
1003
+ """
1004
+ Fetches supply lock events for a given token up to T_cutoff.
1005
+ Schema: supply_locks(timestamp, signature, slot, success, error, priority_fee, protocol, contract_address, sender, recipient, mint_address, total_locked_amount, final_unlock_timestamp)
1006
+ """
1007
+ if not token_address:
1008
+ return []
1009
+
1010
+ query = """
1011
+ SELECT
1012
+ timestamp,
1013
+ signature,
1014
+ slot,
1015
+ success,
1016
+ error,
1017
+ priority_fee,
1018
+ protocol,
1019
+ contract_address,
1020
+ sender,
1021
+ recipient,
1022
+ mint_address,
1023
+ total_locked_amount,
1024
+ final_unlock_timestamp
1025
+ FROM supply_locks
1026
+ WHERE mint_address = %(token)s
1027
+ AND timestamp <= %(T_cutoff)s
1028
+ ORDER BY timestamp ASC
1029
+ """
1030
+ params = {'token': token_address, 'T_cutoff': T_cutoff}
1031
+ # print(f"INFO: Fetching supply lock events for {token_address}.")
1032
+
1033
+ try:
1034
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
1035
+ if not rows:
1036
+ return []
1037
+ columns = [col[0] for col in columns_info]
1038
+ return [dict(zip(columns, row)) for row in rows]
1039
+ except Exception as e:
1040
+ print(f"ERROR: Failed to fetch supply locks for token {token_address}: {e}")
1041
+ return []
1042
+
1043
+ def fetch_token_holders_for_snapshot(self, token_address: str, T_cutoff: datetime.datetime, limit: int = 200) -> List[Dict[str, Any]]:
1044
+ """
1045
+ Fetch top holders for a token at or before T_cutoff for snapshot purposes.
1046
+ Returns rows with wallet_address and current_balance (>0), ordered by balance desc.
1047
+ """
1048
+ if not token_address:
1049
+ return []
1050
+ query = """
1051
+ WITH point_in_time_holdings AS (
1052
+ SELECT *,
1053
+ ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
1054
+ FROM wallet_holdings
1055
+ WHERE mint_address = %(token)s AND updated_at <= %(T_cutoff)s
1056
+ )
1057
+ SELECT wallet_address, current_balance
1058
+ FROM point_in_time_holdings
1059
+ WHERE rn_per_holding = 1 AND current_balance > 0
1060
+ ORDER BY current_balance DESC
1061
+ LIMIT %(limit)s;
1062
+ """
1063
+ params = {'token': token_address, 'T_cutoff': T_cutoff, 'limit': int(limit)}
1064
+ # print(f"INFO: Fetching top holders for snapshot for {token_address} (limit {limit}).")
1065
+ try:
1066
+ rows, columns_info = self.db_client.execute(query, params, with_column_types=True)
1067
+ if not rows:
1068
+ return []
1069
+ columns = [col[0] for col in columns_info]
1070
+ return [dict(zip(columns, row)) for row in rows]
1071
+ except Exception as e:
1072
+ print(f"ERROR: Failed to fetch token holders for {token_address}: {e}")
1073
+ return []
1074
+
1075
+ def fetch_total_holders_count_for_token(self, token_address: str, T_cutoff: datetime.datetime) -> int:
1076
+ """
1077
+ Returns the total number of wallets holding the token (current_balance > 0)
1078
+ at or before T_cutoff.
1079
+ """
1080
+ if not token_address:
1081
+ return 0
1082
+ query = """
1083
+ WITH point_in_time_holdings AS (
1084
+ SELECT *,
1085
+ ROW_NUMBER() OVER(PARTITION BY wallet_address, mint_address ORDER BY updated_at DESC) as rn_per_holding
1086
+ FROM wallet_holdings
1087
+ WHERE mint_address = %(token)s AND updated_at <= %(T_cutoff)s
1088
+ )
1089
+ SELECT count()
1090
+ FROM point_in_time_holdings
1091
+ WHERE rn_per_holding = 1 AND current_balance > 0;
1092
+ """
1093
+ params = {'token': token_address, 'T_cutoff': T_cutoff}
1094
+ # print(f"INFO: Counting total holders for {token_address} at timestamp {T_cutoff}.")
1095
+ try:
1096
+ rows = self.db_client.execute(query, params)
1097
+ if not rows:
1098
+ return 0
1099
+ return int(rows[0][0])
1100
+ except Exception as e:
1101
+ print(f"ERROR: Failed to count total holders for token {token_address}: {e}")
1102
+ return 0
1103
+
1104
+ def fetch_holder_snapshot_stats_for_token(self, token_address: str, T_cutoff: datetime.datetime, limit: int = 200) -> Tuple[int, List[Dict[str, Any]]]:
1105
+ """
1106
+ Fetch total holder count at a point in time.
1107
+ Returns (count, top_holders_list).
1108
+ Uses the indexed wallet_holdings table directly - efficient due to mint_address filter.
1109
+ """
1110
+ if not token_address:
1111
+ return 0, []
1112
+
1113
+ holder_count = self.fetch_total_holders_count_for_token(token_address, T_cutoff)
1114
+ return holder_count, []
1115
+ def fetch_raw_token_data(
1116
+ self,
1117
+ token_address: str,
1118
+ creator_address: str,
1119
+ mint_timestamp: datetime.datetime,
1120
+ max_horizon_seconds: int = 3600,
1121
+ include_wallet_data: bool = True,
1122
+ include_graph: bool = True,
1123
+ min_trades: int = 0,
1124
+ full_history: bool = False,
1125
+ prune_failed: bool = False,
1126
+ prune_transfers: bool = False
1127
+ ) -> Optional[Dict[str, Any]]:
1128
+ """
1129
+ Fetches ALL available data for a token up to the maximum horizon.
1130
+ This data is agnostic of T_cutoff and will be masked/filtered dynamically during training.
1131
+ Wallet/graph data can be skipped to avoid caching T_cutoff-dependent features.
1132
+
1133
+ Args:
1134
+ full_history: If True, fetches ALL trades ignoring H/B/H limits.
1135
+ prune_failed: If True, filters out failed trades from the result.
1136
+ prune_transfers: If True, skips fetching transfers entirely.
1137
+ """
1138
+
1139
+ # 1. Calculate the absolute maximum timestamp we care about (mint + max_horizon)
1140
+ # We fetch everything up to this point.
1141
+ max_limit_time = mint_timestamp + datetime.timedelta(seconds=max_horizon_seconds)
1142
+
1143
+ # 2. Fetch all trades up to max_limit_time
1144
+ # Note: We pass None as T_cutoff to fetch_trades_for_token if we want *everything*,
1145
+ # but here we likely want to bound it by our max training horizon to avoid fetching months of data.
1146
+ # However, the existing method signature expects T_cutoff.
1147
+ # So we pass max_limit_time as the "cutoff" for the purpose of raw data collection.
1148
+
1149
+ # We use a large enough limit to get all relevant trades for the session
1150
+ # If full_history is True, these limits are ignored inside the method.
1151
+ early_trades, middle_trades, recent_trades = self.fetch_trades_for_token(
1152
+ token_address, max_limit_time, 30000, 10000, 15000, full_history=full_history
1153
+ )
1154
+
1155
+ # Combine and deduplicate trades
1156
+ all_trades = {}
1157
+ for t in early_trades + middle_trades + recent_trades:
1158
+ # key: (slot, tx_idx, instr_idx)
1159
+ key = (t.get('slot'), t.get('transaction_index'), t.get('instruction_index'), t.get('signature'))
1160
+ all_trades[key] = t
1161
+
1162
+ sorted_trades = sorted(list(all_trades.values()), key=lambda x: x['timestamp'])
1163
+
1164
+ # --- PRUNING FAILED TRADES ---
1165
+ if prune_failed:
1166
+ original_count = len(sorted_trades)
1167
+ sorted_trades = [t for t in sorted_trades if t.get('success', False)]
1168
+ if len(sorted_trades) < original_count:
1169
+ # print(f" INFO: Pruned {original_count - len(sorted_trades)} failed trades.")
1170
+ pass
1171
+
1172
+ if len(sorted_trades) < min_trades:
1173
+ print(f" SKIP: Token {token_address} has only {len(sorted_trades)} trades (min required: {min_trades}). skipping fetches.")
1174
+ return None
1175
+
1176
+ # 3. Fetch other events
1177
+ # --- PRUNING TRANSFERS ---
1178
+ if prune_transfers:
1179
+ transfers = []
1180
+ # print(" INFO: Pruning transfers (skipping fetch).")
1181
+ else:
1182
+ transfers = self.fetch_transfers_for_token(token_address, max_limit_time, 0.0) # 0.0 means fetch all
1183
+
1184
+ pool_creations = self.fetch_pool_creations_for_token(token_address, max_limit_time)
1185
+
1186
+ # Collect pool addresses to fetch liquidity changes
1187
+ pool_addresses = [p['pool_address'] for p in pool_creations if p.get('pool_address')]
1188
+ liquidity_changes = []
1189
+ if pool_addresses:
1190
+ liquidity_changes = self.fetch_liquidity_changes_for_pools(pool_addresses, max_limit_time)
1191
+
1192
+ fee_collections = self.fetch_fee_collections_for_token(token_address, max_limit_time)
1193
+ burns = self.fetch_burns_for_token(token_address, max_limit_time)
1194
+ supply_locks = self.fetch_supply_locks_for_token(token_address, max_limit_time)
1195
+ migrations = self.fetch_migrations_for_token(token_address, max_limit_time)
1196
+
1197
+ profile_data = {}
1198
+ social_data = {}
1199
+ holdings_data = {}
1200
+ deployed_token_details = {}
1201
+ fetched_graph_entities = {}
1202
+ graph_links = {}
1203
+
1204
+ unique_wallets = set()
1205
+ if include_wallet_data or include_graph:
1206
+ # Identify wallets that interacted with the token up to max_limit_time.
1207
+ unique_wallets.add(creator_address)
1208
+ for t in sorted_trades:
1209
+ if t.get('maker'):
1210
+ unique_wallets.add(t['maker'])
1211
+ for t in transfers:
1212
+ if t.get('source'):
1213
+ unique_wallets.add(t['source'])
1214
+ if t.get('destination'):
1215
+ unique_wallets.add(t['destination'])
1216
+ for p in pool_creations:
1217
+ if p.get('creator_address'):
1218
+ unique_wallets.add(p['creator_address'])
1219
+ for l in liquidity_changes:
1220
+ if l.get('lp_provider'):
1221
+ unique_wallets.add(l['lp_provider'])
1222
+
1223
+ if include_wallet_data and unique_wallets:
1224
+ # Profiles/holdings are time-dependent; only fetch if explicitly requested.
1225
+ profile_data, social_data = self.fetch_wallet_profiles_and_socials(list(unique_wallets), max_limit_time)
1226
+ holdings_data = self.fetch_wallet_holdings(list(unique_wallets), max_limit_time)
1227
+
1228
+ all_deployed_tokens = set()
1229
+ for profile in profile_data.values():
1230
+ all_deployed_tokens.update(profile.get('deployed_tokens', []))
1231
+ if all_deployed_tokens:
1232
+ deployed_token_details = self.fetch_deployed_token_details(list(all_deployed_tokens), max_limit_time)
1233
+
1234
+ if include_graph and unique_wallets:
1235
+ graph_seed_wallets = list(unique_wallets)
1236
+ if len(graph_seed_wallets) > 100:
1237
+ pass
1238
+ fetched_graph_entities, graph_links = self.fetch_graph_links(
1239
+ graph_seed_wallets,
1240
+ max_limit_time,
1241
+ max_degrees=1
1242
+ )
1243
+
1244
+ return {
1245
+ "token_address": token_address,
1246
+ "creator_address": creator_address,
1247
+ "mint_timestamp": mint_timestamp,
1248
+ "max_limit_time": max_limit_time,
1249
+ "trades": sorted_trades,
1250
+ "transfers": transfers,
1251
+ "pool_creations": pool_creations,
1252
+ "liquidity_changes": liquidity_changes,
1253
+ "fee_collections": fee_collections,
1254
+ "burns": burns,
1255
+ "supply_locks": supply_locks,
1256
+ "migrations": migrations,
1257
+ "profiles": profile_data,
1258
+ "socials": social_data,
1259
+ "holdings": holdings_data,
1260
+ "deployed_token_details": deployed_token_details,
1261
+ "graph_entities": fetched_graph_entities,
1262
+ "graph_links": graph_links
1263
+ }
data/data_collator.py CHANGED
@@ -144,23 +144,32 @@ class MemecoinCollator:
144
  item_wallet_addr_to_global_idx = {addr: wallet_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_wallets.keys()}
145
  item_token_addr_to_global_idx = {addr: token_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_tokens.keys()}
146
  for link_name, data in item.get('graph_links', {}).items():
147
- aggregated_links[link_name]['links_list'].extend(data.get('links', []))
148
  triplet = vocab.LINK_NAME_TO_TRIPLET.get(link_name)
149
  if not triplet: continue
150
  src_type, _, dst_type = triplet
151
  edges = data.get('edges')
152
- if not edges: continue
 
 
153
  src_map = item_wallet_addr_to_global_idx if src_type == 'wallet' else item_token_addr_to_global_idx
154
  dst_map = item_wallet_addr_to_global_idx if dst_type == 'wallet' else item_token_addr_to_global_idx
 
155
  remapped_edge_list = []
156
- for src_addr, dst_addr in edges:
 
 
157
  src_idx_global = src_map.get(src_addr, self.entity_pad_idx)
158
  dst_idx_global = dst_map.get(dst_addr, self.entity_pad_idx)
 
159
  if src_idx_global != self.entity_pad_idx and dst_idx_global != self.entity_pad_idx:
160
  remapped_edge_list.append([src_idx_global, dst_idx_global])
 
 
161
  if remapped_edge_list:
162
  remapped_edge_tensor = torch.tensor(remapped_edge_list, device=self.device, dtype=torch.long).t()
163
  aggregated_links[link_name]['edge_index_list'].append(remapped_edge_tensor)
 
164
  if link_name == "TransferLink":
165
  link_props = data.get('links', [])
166
  derived_edges = []
@@ -737,7 +746,7 @@ class MemecoinCollator:
737
  # Labels
738
  'labels': torch.stack([item['labels'] for item in batch]) if batch and 'labels' in batch[0] else None,
739
  'labels_mask': torch.stack([item['labels_mask'] for item in batch]) if batch and 'labels_mask' in batch[0] else None,
740
- 'quality_score': torch.stack([item['quality_score'] for item in batch]) if batch and 'quality_score' in batch[0] else None,
741
  'class_id': torch.tensor([item.get('class_id', 0) for item in batch], dtype=torch.long),
742
  # Debug info
743
  'token_addresses': [item.get('token_address', 'unknown') for item in batch],
 
144
  item_wallet_addr_to_global_idx = {addr: wallet_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_wallets.keys()}
145
  item_token_addr_to_global_idx = {addr: token_addr_to_batch_idx.get(addr, self.entity_pad_idx) for addr in item_tokens.keys()}
146
  for link_name, data in item.get('graph_links', {}).items():
147
+ # aggregated_links[link_name]['links_list'].extend(data.get('links', [])) - REMOVED: Now handled inside the loop for sync
148
  triplet = vocab.LINK_NAME_TO_TRIPLET.get(link_name)
149
  if not triplet: continue
150
  src_type, _, dst_type = triplet
151
  edges = data.get('edges')
152
+ link_props_list = data.get('links', [])
153
+ if not edges or not link_props_list: continue
154
+
155
  src_map = item_wallet_addr_to_global_idx if src_type == 'wallet' else item_token_addr_to_global_idx
156
  dst_map = item_wallet_addr_to_global_idx if dst_type == 'wallet' else item_token_addr_to_global_idx
157
+
158
  remapped_edge_list = []
159
+ valid_link_props = []
160
+
161
+ for (src_addr, dst_addr), props in zip(edges, link_props_list):
162
  src_idx_global = src_map.get(src_addr, self.entity_pad_idx)
163
  dst_idx_global = dst_map.get(dst_addr, self.entity_pad_idx)
164
+
165
  if src_idx_global != self.entity_pad_idx and dst_idx_global != self.entity_pad_idx:
166
  remapped_edge_list.append([src_idx_global, dst_idx_global])
167
+ valid_link_props.append(props)
168
+
169
  if remapped_edge_list:
170
  remapped_edge_tensor = torch.tensor(remapped_edge_list, device=self.device, dtype=torch.long).t()
171
  aggregated_links[link_name]['edge_index_list'].append(remapped_edge_tensor)
172
+ aggregated_links[link_name]['links_list'].extend(valid_link_props)
173
  if link_name == "TransferLink":
174
  link_props = data.get('links', [])
175
  derived_edges = []
 
746
  # Labels
747
  'labels': torch.stack([item['labels'] for item in batch]) if batch and 'labels' in batch[0] else None,
748
  'labels_mask': torch.stack([item['labels_mask'] for item in batch]) if batch and 'labels_mask' in batch[0] else None,
749
+ 'quality_score': torch.stack([item['quality_score'] if isinstance(item['quality_score'], torch.Tensor) else torch.tensor(item['quality_score'], dtype=torch.float32) for item in batch]) if batch and 'quality_score' in batch[0] else None,
750
  'class_id': torch.tensor([item.get('class_id', 0) for item in batch], dtype=torch.long),
751
  # Debug info
752
  'token_addresses': [item.get('token_address', 'unknown') for item in batch],
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:499481b1050c456cb48eddbfd2a4437c8b686715e8eec7c74e8edf2b43191591
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af6751fb5666ccfd4c61d27c549e5fcd71d964090836f9d3646d6f1d63224c0
3
  size 1660
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a102d8b3b6d3be1c81eac0be542ca3f91e17b4612ac00b50843669fa4e38ba5
3
- size 57319
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41a901f956af52a553855651ff68f78a817ad4fa5b108efde1034e22a16724a0
3
+ size 4577
models/graph_updater.py CHANGED
@@ -400,10 +400,10 @@ class GraphUpdater(nn.Module):
400
 
401
  # Use vocabulary to get the triplet (src, rel, dst)
402
  # Make sure ID_TO_LINK_TYPE is correctly populated
403
- if link_name not in vocabulary.LINK_NAME_TO_TRIPLET:
404
  print(f"Warning: Link name '{link_name}' not found in vocabulary.LINK_NAME_TO_TRIPLET. Skipping.")
405
  continue
406
- src_type, rel_type, dst_type = vocabulary.LINK_NAME_TO_TRIPLET[link_name]
407
 
408
  # Check if encoder exists for this link name
409
  if link_name not in self.edge_encoders:
@@ -466,10 +466,9 @@ class GraphUpdater(nn.Module):
466
  print(f"Warning: Relation '{rel_type}' missing in block {block_key}. Skipping.")
467
  continue
468
 
469
- # *** THE FIX ***
470
- # Use scatter_add_ to accumulate messages for the destination node type.
471
- # This correctly handles multiple edge types pointing to the same node type.
472
- msg_aggregates[dst_type].scatter_add_(0, edge_index[1].unsqueeze(1).expand_as(messages), messages)
473
 
474
  # --- Aggregation & Update (Residual Connection) ---
475
  x_next = {}
 
400
 
401
  # Use vocabulary to get the triplet (src, rel, dst)
402
  # Make sure ID_TO_LINK_TYPE is correctly populated
403
+ if link_name not in models.vocabulary.LINK_NAME_TO_TRIPLET:
404
  print(f"Warning: Link name '{link_name}' not found in vocabulary.LINK_NAME_TO_TRIPLET. Skipping.")
405
  continue
406
+ src_type, rel_type, dst_type = models.vocabulary.LINK_NAME_TO_TRIPLET[link_name]
407
 
408
  # Check if encoder exists for this link name
409
  if link_name not in self.edge_encoders:
 
466
  print(f"Warning: Relation '{rel_type}' missing in block {block_key}. Skipping.")
467
  continue
468
 
469
+ # GATv2Conv output is already per-destination-node (shape [num_dst_nodes, node_dim])
470
+ # NOT per-edge. So we directly accumulate, no scatter needed.
471
+ msg_aggregates[dst_type] += messages
 
472
 
473
  # --- Aggregation & Update (Residual Connection) ---
474
  x_next = {}
sample_12LJX4a83B4tCuZ1_3.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/.ipynb_checkpoints/cache_dataset-checkpoint.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import sys
4
+ import argparse
5
+ import numpy as np
6
+ import datetime
7
+ import torch
8
+ import json
9
+ import math
10
+ from pathlib import Path
11
+ from tqdm import tqdm
12
+ from dotenv import load_dotenv
13
+ import huggingface_hub
14
+ import logging
15
+ from concurrent.futures import ProcessPoolExecutor, as_completed
16
+ import multiprocessing as mp
17
+
18
+ logging.getLogger("httpx").setLevel(logging.WARNING)
19
+ logging.getLogger("transformers").setLevel(logging.ERROR)
20
+ logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
21
+
22
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
23
+
24
+ from scripts.analyze_distribution import get_return_class_map
25
+ from scripts.compute_quality_score import get_token_quality_scores, fetch_token_metrics, _bucket_id, _midrank_percentiles, EPS
26
+
27
+ from clickhouse_driver import Client as ClickHouseClient
28
+ from neo4j import GraphDatabase
29
+
30
+ _worker_dataset = None
31
+ _worker_return_class_map = None
32
+ _worker_quality_scores_map = None
33
+
34
+
35
+ def _init_worker(db_config, dataset_config, return_class_map, quality_scores_map):
36
+ global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
37
+ from data.data_loader import OracleDataset
38
+ from data.data_fetcher import DataFetcher
39
+
40
+ clickhouse_client = ClickHouseClient(host=db_config['clickhouse_host'], port=db_config['clickhouse_port'])
41
+ neo4j_driver = GraphDatabase.driver(db_config['neo4j_uri'], auth=(db_config['neo4j_user'], db_config['neo4j_password']))
42
+ data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)
43
+
44
+ _worker_dataset = OracleDataset(
45
+ data_fetcher=data_fetcher,
46
+ max_samples=dataset_config['max_samples'],
47
+ start_date=dataset_config['start_date'],
48
+ ohlc_stats_path=dataset_config['ohlc_stats_path'],
49
+ horizons_seconds=dataset_config['horizons_seconds'],
50
+ quantiles=dataset_config['quantiles'],
51
+ min_trade_usd=dataset_config['min_trade_usd'],
52
+ max_seq_len=dataset_config['max_seq_len']
53
+ )
54
+ _worker_dataset.sampled_mints = dataset_config['sampled_mints']
55
+ _worker_return_class_map = return_class_map
56
+ _worker_quality_scores_map = quality_scores_map
57
+
58
+
59
+ def _process_single_token_context(args):
60
+ idx, mint_addr, samples_per_token, output_dir = args
61
+ global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
62
+ try:
63
+ class_id = _worker_return_class_map.get(mint_addr)
64
+ if class_id is None:
65
+ return {'status': 'skipped', 'reason': 'not in class map', 'mint': mint_addr}
66
+ contexts = _worker_dataset.__cacheitem_context__(idx, num_samples_per_token=samples_per_token)
67
+ if not contexts:
68
+ return {'status': 'skipped', 'reason': 'no valid contexts', 'mint': mint_addr}
69
+ q_score = _worker_quality_scores_map.get(mint_addr)
70
+ if q_score is None:
71
+ return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
72
+ saved_files = []
73
+ for ctx_idx, ctx in enumerate(contexts):
74
+ ctx["quality_score"] = q_score
75
+ ctx["class_id"] = class_id
76
+ ctx["source_token"] = mint_addr
77
+ ctx["cache_mode"] = "context"
78
+ filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
79
+ output_path = Path(output_dir) / filename
80
+ torch.save(ctx, output_path)
81
+ saved_files.append(filename)
82
+ return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_contexts': len(contexts), 'n_events': len(contexts[0].get('event_sequence', [])) if contexts else 0, 'files': saved_files}
83
+ except Exception as e:
84
+ import traceback
85
+ return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
86
+
87
+
88
+ def _process_single_token_raw(args):
89
+ idx, mint_addr, output_dir = args
90
+ global _worker_dataset, _worker_return_class_map, _worker_quality_scores_map
91
+ try:
92
+ class_id = _worker_return_class_map.get(mint_addr)
93
+ if class_id is None:
94
+ return {'status': 'skipped', 'reason': 'not in class map', 'mint': mint_addr}
95
+ item = _worker_dataset.__cacheitem__(idx)
96
+ if item is None:
97
+ return {'status': 'skipped', 'reason': 'cacheitem returned None', 'mint': mint_addr}
98
+ q_score = _worker_quality_scores_map.get(mint_addr)
99
+ if q_score is None:
100
+ return {'status': 'skipped', 'reason': 'no quality score', 'mint': mint_addr}
101
+ item["quality_score"] = q_score
102
+ item["class_id"] = class_id
103
+ item["cache_mode"] = "raw"
104
+ filename = f"sample_{mint_addr[:16]}.pt"
105
+ output_path = Path(output_dir) / filename
106
+ torch.save(item, output_path)
107
+ return {'status': 'success', 'mint': mint_addr, 'class_id': class_id, 'q_score': q_score, 'n_trades': len(item.get('trades', [])), 'files': [filename]}
108
+ except Exception as e:
109
+ import traceback
110
+ return {'status': 'error', 'mint': mint_addr, 'error': str(e), 'traceback': traceback.format_exc()}
111
+
112
+
113
def compute_save_ohlc_stats(client, output_path):
    """Compute global price/trade-value normalization stats and save as .npz.

    Queries mean/stddev of USD price, native price, and trade USD value over
    all trades with positive price and value. Falls back to identity
    normalization (mean=0, std=1) when the query returns nothing. Failures
    are logged and swallowed — this is best-effort, never fatal.
    """
    print(f"INFO: Computing OHLC stats...")
    query = """SELECT AVG(t.price_usd), stddevPop(t.price_usd), AVG(t.price), stddevPop(t.price), AVG(t.total_usd), stddevPop(t.total_usd) FROM trades AS t WHERE t.price_usd > 0 AND t.total_usd > 0"""
    # Identity-normalization defaults, used when the query yields no row.
    stats = {"mean_price_usd": 0.0, "std_price_usd": 1.0, "mean_price_native": 0.0, "std_price_native": 1.0, "mean_trade_value_usd": 0.0, "std_trade_value_usd": 1.0}
    try:
        result = client.execute(query)
        if result and result[0]:
            keys = ("mean_price_usd", "std_price_usd", "mean_price_native",
                    "std_price_native", "mean_trade_value_usd", "std_trade_value_usd")
            fallbacks = (0, 1, 0, 1, 0, 1)
            # `v or d` mirrors the original NULL/zero fallback: NULL mean -> 0, NULL/0 std -> 1.
            stats = {k: float(v or d) for k, v, d in zip(keys, result[0], fallbacks)}
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        np.savez(output_path, **stats)
        print(f"INFO: Saved OHLC stats to {output_path}")
    except Exception as e:
        print(f"ERROR: Failed to compute OHLC stats: {e}")
128
+
129
+
130
def main():
    """CLI entry point: build a class-balanced on-disk cache of training samples.

    Pipeline: connect to ClickHouse/Neo4j, compute OHLC normalization stats,
    filter tokens (class map -> trade count -> quality score), build a
    class-balanced task list with per-class multi-sampling, then cache each
    token via a single process or a ProcessPoolExecutor. Supports resume
    (skips tokens that already have cached files) and writes class metadata
    plus an error log into the output directory.
    """
    load_dotenv()
    # 'spawn' avoids forking live DB connections/sockets into worker processes.
    mp.set_start_method('spawn', force=True)

    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        print(f"INFO: Logging in to Hugging Face...")
        huggingface_hub.login(token=hf_token)

    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default="data/cache")
    parser.add_argument("--max_samples", type=int, default=None)
    parser.add_argument("--start_date", type=str, default=None)
    parser.add_argument("--ohlc_stats_path", type=str, default="data/ohlc_stats.npz")
    parser.add_argument("--min_trade_usd", type=float, default=0.0)
    parser.add_argument("--cache_mode", type=str, default="raw", choices=["raw", "context"])
    parser.add_argument("--context_length", type=int, default=8192)
    parser.add_argument("--min_trades", type=int, default=10)
    parser.add_argument("--samples_per_token", type=int, default=1)
    parser.add_argument("--horizons_seconds", type=int, nargs="+", default=[30, 60, 120, 240, 420])
    parser.add_argument("--quantiles", type=float, nargs="+", default=[0.1, 0.5, 0.9])
    parser.add_argument("--num_workers", type=int, default=1)
    parser.add_argument("--clickhouse_host", type=str, default=os.getenv("CLICKHOUSE_HOST", "localhost"))
    parser.add_argument("--clickhouse_port", type=int, default=int(os.getenv("CLICKHOUSE_PORT", 9000)))
    parser.add_argument("--neo4j_uri", type=str, default=os.getenv("NEO4J_URI", "bolt://localhost:7687"))
    parser.add_argument("--neo4j_user", type=str, default=os.getenv("NEO4J_USER", "neo4j"))
    parser.add_argument("--neo4j_password", type=str, default=os.getenv("NEO4J_PASSWORD", "password"))
    args = parser.parse_args()

    # num_workers == 0 means "auto": use all cores minus a few for the DBs/OS.
    if args.num_workers == 0:
        args.num_workers = max(1, mp.cpu_count() - 4)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    start_date_dt = datetime.datetime.strptime(args.start_date, "%Y-%m-%d") if args.start_date else None

    print(f"INFO: Initializing DB Connections...")
    clickhouse_client = ClickHouseClient(host=args.clickhouse_host, port=args.clickhouse_port)
    neo4j_driver = GraphDatabase.driver(args.neo4j_uri, auth=(args.neo4j_user, args.neo4j_password))

    try:
        compute_save_ohlc_stats(clickhouse_client, args.ohlc_stats_path)

        # Lazy imports: keep module import light / avoid circular imports.
        from data.data_loader import OracleDataset
        from data.data_fetcher import DataFetcher
        data_fetcher = DataFetcher(clickhouse_client=clickhouse_client, neo4j_driver=neo4j_driver)

        print("INFO: Fetching Return Classification Map...")
        return_class_map, _ = get_return_class_map(clickhouse_client)
        print(f"INFO: Loaded {len(return_class_map)} classified tokens.")

        print("INFO: Fetching Quality Scores...")
        quality_scores_map = get_token_quality_scores(clickhouse_client)
        print(f"INFO: Loaded {len(quality_scores_map)} quality scores.")

        dataset = OracleDataset(data_fetcher=data_fetcher, max_samples=args.max_samples, start_date=start_date_dt, ohlc_stats_path=args.ohlc_stats_path, horizons_seconds=args.horizons_seconds, quantiles=args.quantiles, min_trade_usd=args.min_trade_usd, max_seq_len=args.context_length)

        if len(dataset) == 0:
            print("WARNING: No samples. Exiting.")
            return

        # Filter mints by return_class_map (only tokens with a class label)
        original_size = len(dataset.sampled_mints)
        filtered_mints = [m for m in dataset.sampled_mints if m['mint_address'] in return_class_map]
        print(f"INFO: Filtered by class map: {original_size} -> {len(filtered_mints)} tokens")

        # Pre-filter: only keep tokens with >= min_trades trades (fast ClickHouse count query)
        print(f"INFO: Pre-filtering tokens by trade count (>= {args.min_trades} trades)...")
        trade_counts = clickhouse_client.execute("""
            SELECT base_address, count() as cnt
            FROM trades
            GROUP BY base_address
            HAVING cnt >= %(min_trades)s
        """, {'min_trades': args.min_trades})
        valid_tokens = {row[0] for row in trade_counts}
        pre_filter_size = len(filtered_mints)
        filtered_mints = [m for m in filtered_mints if m['mint_address'] in valid_tokens]
        print(f"INFO: Pre-filtered by trade count: {pre_filter_size} -> {len(filtered_mints)} tokens (removed {pre_filter_size - len(filtered_mints)} with < {args.min_trades} trades)")

        # Also filter by quality score availability
        pre_quality_size = len(filtered_mints)
        filtered_mints = [m for m in filtered_mints if m['mint_address'] in quality_scores_map]
        print(f"INFO: Filtered by quality score: {pre_quality_size} -> {len(filtered_mints)} tokens")

        if len(filtered_mints) == 0:
            print("WARNING: No tokens after filtering.")
            return

        print(f"INFO: Cache mode: {args.cache_mode}, Workers: {args.num_workers}")

        # Workers re-create their own DB connections/dataset from these configs
        # (live connections cannot be pickled across the spawn boundary).
        db_config = {'clickhouse_host': args.clickhouse_host, 'clickhouse_port': args.clickhouse_port, 'neo4j_uri': args.neo4j_uri, 'neo4j_user': args.neo4j_user, 'neo4j_password': args.neo4j_password}
        dataset_config = {'max_samples': args.max_samples, 'start_date': start_date_dt, 'ohlc_stats_path': args.ohlc_stats_path, 'horizons_seconds': args.horizons_seconds, 'quantiles': args.quantiles, 'min_trade_usd': args.min_trade_usd, 'max_seq_len': args.context_length, 'sampled_mints': filtered_mints}

        # Build tasks with class-aware multi-sampling for balanced cache
        import random
        from collections import Counter, defaultdict

        # Count eligible tokens per class
        eligible_class_counts = Counter()
        mints_by_class = defaultdict(list)
        for i, m in enumerate(filtered_mints):
            cid = return_class_map.get(m['mint_address'])
            if cid is not None:
                eligible_class_counts[cid] += 1
                mints_by_class[cid].append((i, m))

        print(f"INFO: Eligible tokens per class: {dict(sorted(eligible_class_counts.items()))}")

        # Compute balanced samples_per_token for each class
        num_classes = len(eligible_class_counts)
        if args.max_samples:
            target_total = args.max_samples
        else:
            target_total = 15000  # Default target: 15k balanced files
        target_per_class = target_total // max(num_classes, 1)

        class_multipliers = {}
        class_token_caps = {}
        for cid, count in eligible_class_counts.items():
            if count >= target_per_class:
                # Enough tokens — 1 sample each, cap token count
                class_multipliers[cid] = 1
                class_token_caps[cid] = target_per_class
            else:
                # Not enough tokens — multi-sample, use all tokens
                # (capped at 10 samples per token to bound duplication)
                class_multipliers[cid] = min(10, max(1, math.ceil(target_per_class / max(count, 1))))
                class_token_caps[cid] = count

        print(f"INFO: Target total: {target_total}, Target per class: {target_per_class}")
        print(f"INFO: Class multipliers: {dict(sorted(class_multipliers.items()))}")
        print(f"INFO: Class token caps: {dict(sorted(class_token_caps.items()))}")

        # Build balanced task list
        tasks = []
        for cid, mint_list in mints_by_class.items():
            random.shuffle(mint_list)
            cap = class_token_caps.get(cid, len(mint_list))
            spt = class_multipliers.get(cid, 1)
            # Override with CLI --samples_per_token if explicitly set > 1
            if args.samples_per_token > 1:
                spt = args.samples_per_token
            for i, m in mint_list[:cap]:
                mint_addr = m['mint_address']
                if args.cache_mode == "context":
                    tasks.append((i, mint_addr, spt, str(output_dir)))
                else:
                    tasks.append((i, mint_addr, str(output_dir)))

        random.shuffle(tasks)  # Shuffle tasks for even load distribution across workers
        expected_files = sum(
            class_multipliers.get(cid, 1) * min(class_token_caps.get(cid, len(ml)), len(ml))
            for cid, ml in mints_by_class.items()
        )
        print(f"INFO: Total tasks: {len(tasks)} (expected ~{expected_files} output files, target ~{target_total})")

        success_count, skipped_count, error_count = 0, 0, 0
        class_distribution = {}

        # --- Resume support: skip tokens that already have cached files ---
        existing_files = set(f.name for f in output_dir.glob("sample_*.pt"))
        if existing_files:
            pre_resume = len(tasks)
            filtered_tasks = []
            already_cached = 0
            for task in tasks:
                mint_addr = task[1]  # task = (idx, mint_addr, ...)
                # Check if any file exists for this mint (context mode: sample_MINT_0.pt, raw mode: sample_MINT.pt)
                mint_prefix = f"sample_{mint_addr[:16]}"
                has_cached = any(ef.startswith(mint_prefix) for ef in existing_files)
                if has_cached:
                    already_cached += 1
                    # Count existing files toward class distribution
                    cid = return_class_map.get(mint_addr)
                    if cid is not None:
                        class_distribution[cid] = class_distribution.get(cid, 0) + 1
                    success_count += 1
                else:
                    filtered_tasks.append(task)
            tasks = filtered_tasks
            print(f"INFO: Resume: {already_cached} tokens already cached, {len(tasks)} remaining (was {pre_resume})")

        print(f"INFO: Starting to cache {len(tasks)} tokens...")
        process_fn = _process_single_token_context if args.cache_mode == "context" else _process_single_token_raw

        import time as _time

        def _log_progress(task_num, total, start_time, recent_times, success_count, skipped_count, error_count):
            """Print progress with rolling ETA every 10 tokens."""
            if (task_num + 1) % 10 == 0 and recent_times:
                avg_time = sum(recent_times) / len(recent_times)
                remaining = total - (task_num + 1)
                eta_seconds = avg_time * remaining
                eta_hours = eta_seconds / 3600
                wall_elapsed = _time.perf_counter() - start_time
                speed = (task_num + 1) / wall_elapsed
                tqdm.write(
                    f" [PROGRESS] {task_num+1}/{total} | "
                    f"Speed: {speed:.1f} tok/s ({speed*60:.0f} tok/min) | "
                    f"Avg: {avg_time:.1f}s/tok | "
                    f"ETA: {eta_hours:.1f}h | "
                    f"OK: {success_count} Skip: {skipped_count} Err: {error_count}"
                )

        # Error log file for diagnosing failures
        error_log_path = Path(args.output_dir) / "cache_errors.log"
        error_samples = []  # First 20 unique error messages

        if args.num_workers == 1:
            print("INFO: Single-threaded mode...")
            _init_worker(db_config, dataset_config, return_class_map, quality_scores_map)
            start_time = _time.perf_counter()
            recent_times = []
            for task_num, task in enumerate(tqdm(tasks, desc="Caching", unit="tok")):
                t0 = _time.perf_counter()
                result = process_fn(task)
                elapsed = _time.perf_counter() - t0
                # Rolling window of the last 50 per-token times for the ETA.
                recent_times.append(elapsed)
                if len(recent_times) > 50:
                    recent_times.pop(0)
                if result['status'] == 'success':
                    success_count += 1
                    class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
                elif result['status'] == 'skipped':
                    skipped_count += 1
                else:
                    error_count += 1
                    err_msg = result.get('error', 'unknown')
                    tqdm.write(f"ERROR: {result['mint'][:16]} - {err_msg}")
                    if len(error_samples) < 20:
                        error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
                _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)
        else:
            print(f"INFO: Running with {args.num_workers} workers...")
            start_time = _time.perf_counter()
            recent_times = []
            with ProcessPoolExecutor(max_workers=args.num_workers, initializer=_init_worker, initargs=(db_config, dataset_config, return_class_map, quality_scores_map)) as executor:
                futures = {executor.submit(process_fn, task): task for task in tasks}
                for task_num, future in enumerate(tqdm(as_completed(futures), total=len(futures), desc="Caching", unit="tok")):
                    # NOTE(review): t0 is taken after as_completed yields, so
                    # `elapsed` measures only result retrieval, not task runtime.
                    t0 = _time.perf_counter()
                    try:
                        result = future.result(timeout=300)
                        elapsed = _time.perf_counter() - t0
                        recent_times.append(elapsed)
                        if len(recent_times) > 50:
                            recent_times.pop(0)
                        if result['status'] == 'success':
                            success_count += 1
                            class_distribution[result['class_id']] = class_distribution.get(result['class_id'], 0) + 1
                        elif result['status'] == 'skipped':
                            skipped_count += 1
                        else:
                            error_count += 1
                            err_msg = result.get('error', 'unknown')
                            if len(error_samples) < 20:
                                error_samples.append({'mint': result.get('mint'), 'error': err_msg, 'traceback': result.get('traceback', '')})
                            if error_count <= 5:
                                tqdm.write(f"ERROR: {result.get('mint', '?')[:16]} - {err_msg}")
                    except Exception as e:
                        error_count += 1
                        tqdm.write(f"WORKER ERROR: {e}")
                    _log_progress(task_num, len(tasks), start_time, recent_times, success_count, skipped_count, error_count)

        # Write error log
        if error_samples:
            with open(error_log_path, 'w') as ef:
                for i, es in enumerate(error_samples):
                    ef.write(f"=== Error {i+1} === Token: {es['mint']}\n")
                    ef.write(f"Error: {es['error']}\n")
                    ef.write(f"Traceback:\n{es['traceback']}\n\n")
            print(f"INFO: First {len(error_samples)} error tracebacks saved to {error_log_path}")

        print("INFO: Building metadata...")
        # Re-scan the output dir so metadata covers resumed (pre-existing) files too.
        file_class_map = {}
        for f in sorted(output_dir.glob("sample_*.pt")):
            try:
                file_class_map[f.name] = torch.load(f, map_location="cpu", weights_only=False).get("class_id", 0)
            except:
                pass

        with open(output_dir / "class_metadata.json", 'w') as f:
            json.dump({
                'file_class_map': file_class_map,
                'class_distribution': {str(k): v for k, v in class_distribution.items()},
                'cache_mode': args.cache_mode,
                'num_workers': args.num_workers,
                'horizons_seconds': args.horizons_seconds,
                'quantiles': args.quantiles,
                'class_multipliers': {str(k): v for k, v in class_multipliers.items()},
                'class_token_caps': {str(k): v for k, v in class_token_caps.items()},
                'target_total': target_total,
                'target_per_class': target_per_class,
            }, f, indent=2)

        print(f"\n--- Done ---\nSuccess: {success_count}, Skipped: {skipped_count}, Errors: {error_count}\nFiles: {len(file_class_map)}\nLocation: {output_dir.resolve()}")

    finally:
        # Always release DB resources, even on early return or error.
        clickhouse_client.disconnect()
        neo4j_driver.close()
428
+
429
+
430
# Script entry point: run the cache-building pipeline when executed directly.
if __name__ == "__main__":
    main()
scripts/analyze_distribution.py CHANGED
@@ -313,8 +313,108 @@ def print_stats(name, values):
313
 
314
  print(f" {name}: mean={mean:.4f} p50={p50:.4f} p90={p90:.4f} p99={p99:.4f} nonzero_rate={nonzero_rate:.3f} (n={len(vals)})")
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  def analyze():
317
  client = get_client()
 
 
 
 
318
  data = fetch_all_metrics(client)
319
  final_buckets, thresholds, count_manipulated = _classify_tokens(data)
320
 
 
313
 
314
  print(f" {name}: mean={mean:.4f} p50={p50:.4f} p90={p90:.4f} p99={p99:.4f} nonzero_rate={nonzero_rate:.3f} (n={len(vals)})")
315
 
316
def fetch_wallet_pnl_stats(client):
    """Return the single aggregate row of wallet PnL quantiles (7d/30d), or None if empty."""
    print(" -> Fetching Wallet PnL Quantiles (7d, 30d) - Unique per wallet...")
    # Use argMax to get latest entry per wallet (table is a time-series dump)
    query = """
    WITH unique_wallets AS (
        SELECT
            wallet_address,
            argMax(stats_30d_realized_profit_pnl, updated_at) as pnl_30d,
            argMax(stats_7d_realized_profit_pnl, updated_at) as pnl_7d
        FROM wallet_profile_metrics
        GROUP BY wallet_address
    )
    SELECT
        count() as n,
        countIf(pnl_30d > 0.001) as pos_30d,
        quantiles(0.5, 0.9, 0.95, 0.99, 0.999)(pnl_30d) as q_30d,
        max(pnl_30d) as max_30d,

        countIf(pnl_7d > 0.001) as pos_7d,
        quantiles(0.5, 0.9, 0.95, 0.99, 0.999)(pnl_7d) as q_7d,
        max(pnl_7d) as max_7d
    FROM unique_wallets
    WHERE pnl_30d > -999 OR pnl_7d > -999
    """
    rows = client.execute(query)
    # Single aggregate row expected; None signals "nothing to report".
    return rows[0] if rows else None
343
+
344
def fetch_trade_stats(client):
    """Return the single aggregate row of trade-size quantiles (USD and % of supply), or None."""
    print(" -> Fetching Trade Quantiles (USD & Supply %)...")
    query = """
    SELECT
        count() as n,
        quantiles(0.5, 0.9, 0.95, 0.99, 0.999)(t.total_usd) as q_usd,
        quantiles(0.5, 0.9, 0.95, 0.99, 0.999)((t.base_amount / m.total_supply) * 100) as q_sup
    FROM trades t
    JOIN mints m ON t.base_address = m.mint_address
    WHERE m.total_supply > 0
    """
    rows = client.execute(query)
    # Single aggregate row expected; None signals "nothing to report".
    return rows[0] if rows else None
358
+
359
def fetch_kol_stats(client):
    """Return (total wallets with socials, identified KOL count); (0, 0) when the table is empty."""
    print(" -> Fetching KOL stats from wallet_socials...")
    query = """
    SELECT
        uniq(wallet_address) as total_wallets,
        uniqIf(wallet_address, kolscan_name != '' OR cabalspy_name != '' OR axiom_kol_name != '') as kols
    FROM wallet_socials
    """
    rows = client.execute(query)
    print(f" (DEBUG) KOL query result: {rows}")
    # Empty result collapses to zero counts so callers can divide safely.
    return rows[0] if rows else (0, 0)
372
+
373
def print_quantiles(name, n, pos_rate, q, max_val=None):
    """Pretty-print one distribution block.

    q is the quantile list [p50, p90, p95, p99, p99.9]; pos_rate and max_val
    are optional (None suppresses their lines).
    """
    print(f"\n[{name}] (n={n})")
    if pos_rate is not None:
        print(f" Positive Rate: {pos_rate*100:.1f}%")
    for label, value in zip(("p50", "p90", "p95", "p99", "p99.9"), q):
        print(f" {label}={value:.4f}")
    if max_val is not None:
        print(f" Max={max_val:.4f}")
385
+
386
def analyze_thresholds(client):
    """Run the DB-side distribution analysis: wallet PnL, trade sizes, and KOL counts."""
    print("\n=== THRESHOLD DISTRIBUTION ANALYSIS (DB-Side) ===")

    # 1. PnL
    pnl = fetch_wallet_pnl_stats(client)
    if pnl:
        n, pos_30d, q_30d, max_30d, pos_7d, q_7d, max_7d = pnl
        rate_30d = pos_30d / n if n > 0 else 0
        rate_7d = pos_7d / n if n > 0 else 0
        print_quantiles("Wallet PnL (30d)", n, rate_30d, q_30d, max_30d)
        print_quantiles("Wallet PnL (7d)", n, rate_7d, q_7d, max_7d)

    # 2. Trades
    trades = fetch_trade_stats(client)
    if trades:
        n, q_usd, q_sup = trades
        print_quantiles("Trade USD Size", n, None, q_usd)
        print_quantiles("Trade Supply %", n, None, q_sup)

    # 3. KOLs
    total, kols = fetch_kol_stats(client)
    if total > 0:
        print("\n[KOL Statistics]")
        print(f" Total Wallets with Socials: {total}")
        print(f" Identified KOLs: {kols}")
        print(f" KOL Ratio: {(kols/total)*100:.2f}%")
410
+
411
+
412
  def analyze():
413
  client = get_client()
414
+
415
+ # Run new analysis first
416
+ analyze_thresholds(client)
417
+
418
  data = fetch_all_metrics(client)
419
  final_buckets, thresholds, count_manipulated = _classify_tokens(data)
420
 
scripts/dump_cache_sample.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dump a cached .pt sample to JSON for manual debugging.
4
+
5
+ Usage:
6
+ python scripts/dump_cache_sample.py # Dump first sample
7
+ python scripts/dump_cache_sample.py --index 5 # Dump sample at index 5
8
+ python scripts/dump_cache_sample.py --file data/cache/sample_ABC123.pt # Dump specific file
9
+ python scripts/dump_cache_sample.py --output debug.json # Custom output path
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ import os
16
+
17
+ # Add project root to path so torch.load can find project modules when unpickling
18
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ import torch
21
+ import numpy as np
22
+ from pathlib import Path
23
+ from datetime import datetime
24
+
25
+
26
def convert_to_serializable(obj):
    """Recursively convert an arbitrary object into JSON-serializable form.

    Plain scalars pass through unchanged; numpy scalars become Python
    int/float; ndarrays, tensors, datetimes, bytes, and sets are wrapped in
    tagged dicts (``{"__type__": ...}``); dicts, lists, and tuples are
    converted element-wise (dict keys are stringified, tuples become lists).
    Anything else falls back to a truncated ``repr``-style dict, so the
    result is always safe to pass to ``json.dumps``.
    """
    if obj is None:
        return None
    # bool is a subclass of int, so this branch also passes bools through.
    if isinstance(obj, (str, int, float, bool)):
        return obj
    if isinstance(obj, (np.integer,)):
        return int(obj)
    if isinstance(obj, (np.floating,)):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return {"__type__": "ndarray", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": obj.tolist()}
    if isinstance(obj, torch.Tensor):
        return {"__type__": "tensor", "shape": list(obj.shape), "dtype": str(obj.dtype), "data": obj.tolist()}
    if isinstance(obj, datetime):
        return {"__type__": "datetime", "value": obj.isoformat()}
    if isinstance(obj, bytes):
        return {"__type__": "bytes", "length": len(obj), "preview": obj[:100].hex() if len(obj) > 0 else ""}
    if isinstance(obj, dict):
        return {str(k): convert_to_serializable(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [convert_to_serializable(item) for item in obj]
    # BUG FIX: elements must be recursed — a set of datetimes/arrays was
    # previously returned unconverted and broke json.dumps downstream.
    # Also accept frozenset (previously fell through to the repr fallback).
    if isinstance(obj, (set, frozenset)):
        return {"__type__": "set", "data": [convert_to_serializable(item) for item in obj]}
    # Fallback: try str representation
    try:
        return {"__type__": type(obj).__name__, "repr": str(obj)[:500]}
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt/SystemExit
        return {"__type__": "unknown", "repr": "<not serializable>"}
+ return {"__type__": "unknown", "repr": "<not serializable>"}
55
+
56
+
57
def main():
    """Load one cached .pt sample, dump it to JSON, and print a short summary.

    The target file is chosen either explicitly (--file) or by index into the
    sorted ``sample_*.pt`` files in --cache_dir. Returns a process exit code:
    0 on success, 1 on any user-facing error (missing file/dir, bad index,
    unloadable file).
    """
    parser = argparse.ArgumentParser(description="Dump cached .pt sample to JSON")
    parser.add_argument("--index", "-i", type=int, default=0, help="Index of sample to dump (default: 0)")
    parser.add_argument("--file", "-f", type=str, default=None, help="Direct path to .pt file (overrides --index)")
    parser.add_argument("--cache_dir", "-c", type=str, default="data/cache", help="Cache directory (default: data/cache)")
    parser.add_argument("--output", "-o", type=str, default=None, help="Output JSON path (default: auto-generated)")
    parser.add_argument("--compact", action="store_true", help="Compact JSON output (no indentation)")
    args = parser.parse_args()

    # Determine which file to load
    if args.file:
        filepath = Path(args.file)
        if not filepath.exists():
            print(f"ERROR: File not found: {filepath}")
            return 1
    else:
        cache_dir = Path(args.cache_dir)
        if not cache_dir.is_dir():
            print(f"ERROR: Cache directory not found: {cache_dir}")
            return 1

        cached_files = sorted(cache_dir.glob("sample_*.pt"))
        if not cached_files:
            print(f"ERROR: No sample_*.pt files found in {cache_dir}")
            return 1

        if args.index >= len(cached_files):
            print(f"ERROR: Index {args.index} out of range. Found {len(cached_files)} files.")
            return 1

        filepath = cached_files[args.index]

    print(f"Loading: {filepath}")

    # Load the .pt file
    try:
        # weights_only=False: cached samples contain arbitrary Python objects,
        # not just tensors (only safe because these are locally produced files).
        data = torch.load(filepath, map_location="cpu", weights_only=False)
    except Exception as e:
        print(f"ERROR: Failed to load file: {e}")
        return 1

    # Convert to JSON-serializable format
    print("Converting to JSON-serializable format...")
    serializable_data = convert_to_serializable(data)

    # Add metadata
    output_data = {
        "__metadata__": {
            "source_file": str(filepath.absolute()),
            "dumped_at": datetime.now().isoformat(),
            "cache_mode": data.get("cache_mode", "unknown") if isinstance(data, dict) else "unknown"
        },
        "data": serializable_data
    }

    # Determine output path
    if args.output:
        output_path = Path(args.output)
    else:
        # Default: Save to current directory (root) instead of inside cache dir
        output_path = Path.cwd() / filepath.with_suffix(".json").name

    # Write JSON
    print(f"Writing to: {output_path}")
    indent = None if args.compact else 2
    with open(output_path, "w") as f:
        json.dump(output_data, f, indent=indent, ensure_ascii=False)

    # Print summary (only meaningful when the sample is a dict payload)
    if isinstance(data, dict):
        print(f"\n=== Summary ===")
        print(f"Top-level keys: {list(data.keys())}")
        print(f"Cache mode: {data.get('cache_mode', 'not specified')}")
        if 'event_sequence' in data:
            print(f"Event count: {len(data['event_sequence'])}")
        if 'trades' in data:
            print(f"Trade count: {len(data['trades'])}")
        if 'source_token' in data:
            print(f"Source token: {data['source_token']}")
        if 'class_id' in data:
            print(f"Class ID: {data['class_id']}")
        if 'quality_score' in data:
            print(f"Quality score: {data['quality_score']}")

    print(f"\nDone! JSON saved to: {output_path}")
    return 0
+ return 0
143
+
144
+
145
# Propagate main()'s integer status to the shell as the process exit code.
if __name__ == "__main__":
    exit(main())
train.py CHANGED
@@ -406,7 +406,7 @@ def main() -> None:
406
  hf_token = os.getenv("HF_TOKEN")
407
  if hf_token:
408
  print(f"Logging in to Hugging Face with token starting with: {hf_token[:4]}...")
409
- huggingface_hub.login(token=hf_token)
410
  else:
411
  print("WARNING: HF_TOKEN not found in environment.")
412
 
@@ -437,7 +437,7 @@ def main() -> None:
437
  collator_encoder = CollatorEncoder(
438
  model_id=collator.model_id,
439
  dtype=init_dtype,
440
- device="cpu" # Collator runs on CPU to save VRAM
441
  )
442
  _set_worker_encoder(collator_encoder)
443
  logger.info("SigLIP encoder pre-loaded successfully.")
 
406
  hf_token = os.getenv("HF_TOKEN")
407
  if hf_token:
408
  print(f"Logging in to Hugging Face with token starting with: {hf_token[:4]}...")
409
+ pass # huggingface_hub.login(token=hf_token)
410
  else:
411
  print("WARNING: HF_TOKEN not found in environment.")
412
 
 
437
  collator_encoder = CollatorEncoder(
438
  model_id=collator.model_id,
439
  dtype=init_dtype,
440
+ device="cuda" # Use GPU for encoding (requires num_workers=0)
441
  )
442
  _set_worker_encoder(collator_encoder)
443
  logger.info("SigLIP encoder pre-loaded successfully.")
train.sh CHANGED
@@ -1,12 +1,12 @@
1
  accelerate launch train.py \
2
- --epochs 10 \
3
  --batch_size 8 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \
6
  --grad_accum_steps 2 \
7
  --max_grad_norm 1.0 \
8
  --seed 42 \
9
- --log_every 50 \
10
  --save_every 2000 \
11
  --tensorboard_dir runs/oracle \
12
  --checkpoint_dir checkpoints \
@@ -15,8 +15,8 @@ accelerate launch train.py \
15
  --horizons_seconds 30 60 120 240 420 \
16
  --quantiles 0.1 0.5 0.9 \
17
  --ohlc_stats_path ./data/ohlc_stats.npz \
18
- --num_workers 4 \
19
  --pin_memory \
20
  --val_split 0.1 \
21
- --val_every 2000 \
22
  "$@"
 
1
  accelerate launch train.py \
2
+ --epochs 1 \
3
  --batch_size 8 \
4
  --learning_rate 1e-4 \
5
  --warmup_ratio 0.1 \
6
  --grad_accum_steps 2 \
7
  --max_grad_norm 1.0 \
8
  --seed 42 \
9
+ --log_every 3 \
10
  --save_every 2000 \
11
  --tensorboard_dir runs/oracle \
12
  --checkpoint_dir checkpoints \
 
15
  --horizons_seconds 30 60 120 240 420 \
16
  --quantiles 0.1 0.5 0.9 \
17
  --ohlc_stats_path ./data/ohlc_stats.npz \
18
+ --num_workers 0 \
19
  --pin_memory \
20
  --val_split 0.1 \
21
+ --val_every 50 \
22
  "$@"