Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

data/data_loader.py +158 -95
data/ohlc_stats.npz +1 -1
sample_5eeBz7qY2sc8iGRc_0.json +0 -0

data/data_loader.py CHANGED Viewed

@@ -534,8 +534,12 @@ class OracleDataset(Dataset):
         wallet_data: Dict[str, Any],
         total_supply_dec: float,
         _register_event_fn,
-        cached_holders_list: List[List[str]] = None
     ) -> None:
         # Prepare helper sets and maps (static sniper set based on earliest buyers)
         all_buy_trades = sorted([e for e in trade_events if e.get('trade_direction') == 0 and e.get('success', False)], key=lambda x: x['timestamp'])
         sniper_wallets = []
@@ -571,32 +575,38 @@ class OracleDataset(Dataset):
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
             # Per-snapshot holder distribution at ts_value
-            holder_records_ts = []
-            holders_end = 0
-            if cached_holders_list is not None and i < len(cached_holders_list):
-                 # Use cached list of addresses
-                 holder_records_ts = [{'wallet_address': addr, 'current_balance': 0} for addr in cached_holders_list[i]]
-                 holders_end = len(cached_holders_list[i])
-            elif self.fetcher:
-                cutoff_dt_ts = datetime.datetime.fromtimestamp(ts_value, tz=datetime.timezone.utc)
-                holders_end, holder_records_ts = self.fetcher.fetch_holder_snapshot_stats_for_token(
-                    token_address,
-                    cutoff_dt_ts,
-                    limit=HOLDER_SNAPSHOT_TOP_K
                 )
-            else:
-                 holder_records_ts = []
-                 holders_end = 0
             holder_entries_ts = []
             for rec in holder_records_ts:
-                addr = rec.get('wallet_address')
-                try:
-                    bal = float(rec.get('current_balance', 0.0) or 0.0)
-                except (TypeError, ValueError):
-                    bal = 0.0
                 pct = (bal / total_supply_dec) if total_supply_dec and total_supply_dec > 0 else 0.0
                 if addr and pct > 0.0:
                     holder_entries_ts.append({'wallet': addr, 'holding_pct': pct})
             holder_entries_ts.sort(key=lambda d: d['holding_pct'], reverse=True)
             # Emit HolderSnapshot for this ts_value
@@ -1346,18 +1356,21 @@ class OracleDataset(Dataset):
         elapsed = (T_cutoff - mint_timestamp).total_seconds()
         snap_idx = int(elapsed // 300)
         holder_records = []
-        cached_holders_list = raw_data.get('holder_snapshots_list', [])
-        if 0 <= snap_idx < len(cached_holders_list):
-            # Handle both old format (list of addresses) and new format (dict with holders)
-            snapshot_data = cached_holders_list[snap_idx]
-            if isinstance(snapshot_data, dict):
-                # New format: {'timestamp': int, 'holders': [...]}
-                holder_records = [{'wallet_address': h.get('wallet_address')} for h in snapshot_data.get('holders', [])]
-            else:
-                # Old format: list of addresses
-                holder_records = [{'wallet_address': addr} for addr in snapshot_data]
         for holder in holder_records:
-            _add_wallet(holder.get('wallet_address'), wallets_to_fetch)
         _timings['wallet_collection'] = _time.perf_counter() - _t0
         _timings['num_wallets'] = len(wallets_to_fetch)
@@ -1877,7 +1890,7 @@ class OracleDataset(Dataset):
             future_trades_for_labels: List[Dict[str, Any]],
             pooler: EmbeddingPooler,
             sample_idx: Optional[int] = None,
-            cached_holders_list: List[List[str]] = None,
             cached_ohlc_1s: Optional[torch.Tensor] = None,
             quality_score: Optional[float] = None
         ) -> Optional[Dict[str, Any]]:
@@ -2087,7 +2100,7 @@ class OracleDataset(Dataset):
         HIGH_DEF_INTERVAL = ("1s", 1)
         MIDDLE_INTERVAL = ("30s", 30)
-        def _emit_chart_segments(trades: List[Dict[str, Any]], interval: tuple, signature_prefix: str, precomputed_ohlc: List[tuple] = None):
             if not trades and precomputed_ohlc is None:
                 return []
             interval_label, interval_seconds = interval
@@ -2114,12 +2127,12 @@ class OracleDataset(Dataset):
                     'i': interval_label
                 }
                 emitted_events.append(chart_event)
-                _register_event(chart_event, _event_execution_sort_key(last_ts, signature=f"{signature_prefix}-{idx}"))
             return emitted_events
-        # Emit charts
-        chart_events = []
         # Prepare 1s OHLC from cache if available
         ohlc_1s_precomputed = None
         if cached_ohlc_1s is not None and len(cached_ohlc_1s) > 0:
@@ -2134,8 +2147,8 @@ class OracleDataset(Dataset):
                      for i, row in enumerate(slice_tensor)
                  ]
-        chart_events.extend(_emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, "chart-hd", precomputed_ohlc=ohlc_1s_precomputed))
-        chart_events.extend(_emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL, "chart-mid"))
         # 5. Process Other Records (Pool, Liquidity, Fees, Burns, Locks, Migrations)
         pool_meta_by_address = {}
@@ -2331,7 +2344,20 @@ class OracleDataset(Dataset):
             _register_event,
             cached_holders_list=cached_holders_list
         )
         # 7. Finalize Sequence with Dynamic Sampling
         event_sequence_entries.sort(key=lambda x: x[0])
         raw_event_sequence = [entry[1] for entry in event_sequence_entries]
@@ -2628,7 +2654,7 @@ class OracleDataset(Dataset):
         raw_data['ohlc_1s'] = ohlc_1s
-        # Generate holder snapshots
         interval = 300
         num_intervals = (duration_seconds // interval) + 1
         snapshot_stats = torch.zeros((num_intervals, 6), dtype=torch.float32)
@@ -2640,69 +2666,96 @@ class OracleDataset(Dataset):
             if bucket_idx >= 0:
                 buckets[bucket_idx].append(t)
-        # Batch-fetch ALL holder counts in ONE query instead of N per-interval queries
-        # Original code did N sequential fetch_holder_snapshot_stats_for_token() calls
-        # which each execute a window function query. This batch replaces them all.
-        holder_counts_by_interval = {}
-        try:
-            max_snapshot_ts = t0 + datetime.timedelta(seconds=num_intervals * interval)
-            batch_holder_query = """
-                SELECT
-                    intDiv(toUInt64(%(t0_epoch)s) + 300 * number, 1) as snap_epoch,
-                    number as interval_idx
-                FROM numbers(%(num_intervals)s)
-            """
-            # Use a simpler approach: query holder counts for each interval boundary
-            # by fetching ALL holdings for this token once, then bucketing in Python
-            all_holdings = self.fetcher.db_client.execute("""
-                SELECT wallet_address, current_balance, updated_at
-                FROM wallet_holdings
-                WHERE mint_address = %(token)s
-                ORDER BY wallet_address, updated_at
-            """, {'token': token_address})
-            if all_holdings:
-                # Build point-in-time holder counts per interval
-                # For each interval, count wallets with balance > 0 as of that timestamp
-                from collections import defaultdict as _dd
-                wallet_latest = {}  # wallet -> (balance, updated_at)
-                # Sort by updated_at
-                all_holdings.sort(key=lambda x: x[2])
-                # Process holdings chronologically and snapshot at each interval boundary
-                holding_idx = 0
-                for i in range(num_intervals):
-                    snap_ts = t0 + datetime.timedelta(seconds=(i + 1) * interval)
-                    # Apply all holdings up to this timestamp
-                    while holding_idx < len(all_holdings) and all_holdings[holding_idx][2] <= snap_ts:
-                        wallet, balance, _ = all_holdings[holding_idx]
-                        wallet_latest[wallet] = balance
-                        holding_idx += 1
-                    # Count wallets with positive balance
-                    holder_counts_by_interval[i] = sum(1 for b in wallet_latest.values() if b and b > 0)
-        except Exception as e:
-            pass  # Fall back to 0 counts
         holder_snapshots_list = []
         for i in range(num_intervals):
             bucket_trades = buckets[i]
             vol = sum(t.get('total_usd', 0.0) for t in bucket_trades)
             tx = len(bucket_trades)
             buys = sum(1 for t in bucket_trades if t.get('trade_direction') == 0 or t.get('trade_type') == 0)
             sells = tx - buys
-            count = holder_counts_by_interval.get(i, 0)
             snapshot_stats[i, 0] = float(vol)
             snapshot_stats[i, 1] = float(tx)
             snapshot_stats[i, 2] = float(buys)
             snapshot_stats[i, 3] = float(sells)
             snapshot_stats[i, 4] = float(count)
-            snapshot_stats[i, 5] = 0.0  # top10_pct not available in batch mode
             snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
             holder_snapshots_list.append({
                 'timestamp': int(snapshot_ts.timestamp()),
-                'holders': []
             })
         raw_data['snapshots_5m'] = snapshot_stats
@@ -2728,9 +2781,12 @@ class OracleDataset(Dataset):
             if liq.get('lp_provider'):
                 all_wallets.add(liq['lp_provider'])
         for snapshot in holder_snapshots_list:
-            for holder in snapshot.get('holders', []):
-                if holder.get('wallet_address'):
-                    all_wallets.add(holder['wallet_address'])
         all_wallets.discard(None)
         all_wallets.discard('')
@@ -2826,11 +2882,18 @@ class OracleDataset(Dataset):
             # Get holder snapshot at T_cutoff
             elapsed = (T_cutoff - t0).total_seconds()
             snap_idx = int(elapsed // 300)
-            if 0 <= snap_idx < len(holder_snapshots_list):
-                snapshot_data = holder_snapshots_list[snap_idx]
-                for holder in snapshot_data.get('holders', []):
-                    if holder.get('wallet_address'):
-                        wallets_to_fetch.add(holder['wallet_address'])
             wallets_to_fetch.discard(None)
             wallets_to_fetch.discard('')
@@ -2896,7 +2959,7 @@ class OracleDataset(Dataset):
                 future_trades_for_labels=raw_data['trades'],
                 pooler=pooler,
                 sample_idx=idx,
-                cached_holders_list=None,  # Force DB fetch for holder snapshots
                 cached_ohlc_1s=ohlc_1s,
                 quality_score=None  # Will be injected by cache_dataset.py
             )

         wallet_data: Dict[str, Any],
         total_supply_dec: float,
         _register_event_fn,
+        cached_holders_list: List[Dict[str, Any]] = None
     ) -> None:
+        if cached_holders_list is None:
+            raise RuntimeError(
+                f"Missing holder_snapshots_list for token {token_address} in _generate_onchain_snapshots."
+            )
         # Prepare helper sets and maps (static sniper set based on earliest buyers)
         all_buy_trades = sorted([e for e in trade_events if e.get('trade_direction') == 0 and e.get('success', False)], key=lambda x: x['timestamp'])
         sniper_wallets = []
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
             # Per-snapshot holder distribution at ts_value
+            if i >= len(cached_holders_list):
+                raise RuntimeError(
+                    f"holder_snapshots_list too short for token {token_address}: need index {i}, len={len(cached_holders_list)}."
                 )
+            snapshot_data = cached_holders_list[i]
+            if not isinstance(snapshot_data, dict):
+                raise RuntimeError(
+                    f"Invalid holder snapshot entry type for token {token_address} at index {i}: {type(snapshot_data).__name__}."
+                )
+            if 'holders' not in snapshot_data or not isinstance(snapshot_data['holders'], list):
+                raise RuntimeError(
+                    f"Malformed holder snapshot for token {token_address} at index {i}: missing list 'holders'."
+                )
+            holder_records_ts = snapshot_data['holders']
+            holders_end = 0
             holder_entries_ts = []
             for rec in holder_records_ts:
+                if not isinstance(rec, dict):
+                    raise RuntimeError(
+                        f"Malformed holder record for token {token_address} at index {i}: expected dict."
+                    )
+                if 'wallet_address' not in rec or 'current_balance' not in rec:
+                    raise RuntimeError(
+                        f"Malformed holder record for token {token_address} at index {i}: requires wallet_address/current_balance."
+                    )
+                addr = rec['wallet_address']
+                bal = float(rec['current_balance'])
                 pct = (bal / total_supply_dec) if total_supply_dec and total_supply_dec > 0 else 0.0
                 if addr and pct > 0.0:
                     holder_entries_ts.append({'wallet': addr, 'holding_pct': pct})
+                    holders_end += 1
             holder_entries_ts.sort(key=lambda d: d['holding_pct'], reverse=True)
             # Emit HolderSnapshot for this ts_value
         elapsed = (T_cutoff - mint_timestamp).total_seconds()
         snap_idx = int(elapsed // 300)
         holder_records = []
+        cached_holders_list = raw_data.get('holder_snapshots_list')
+        if not isinstance(cached_holders_list, list):
+            raise RuntimeError("Invalid cache: holder_snapshots_list must be a list.")
+        if not (0 <= snap_idx < len(cached_holders_list)):
+            raise RuntimeError(
+                f"Invalid cache: holder_snapshots_list index out of range (snap_idx={snap_idx}, len={len(cached_holders_list)})."
+            )
+        snapshot_data = cached_holders_list[snap_idx]
+        if not isinstance(snapshot_data, dict) or not isinstance(snapshot_data.get('holders'), list):
+            raise RuntimeError("Invalid cache: holder snapshot entry must be a dict with list field 'holders'.")
+        holder_records = snapshot_data['holders']
         for holder in holder_records:
+            if not isinstance(holder, dict) or 'wallet_address' not in holder or 'current_balance' not in holder:
+                raise RuntimeError("Invalid cache: each holder record must include wallet_address and current_balance.")
+            _add_wallet(holder['wallet_address'], wallets_to_fetch)
         _timings['wallet_collection'] = _time.perf_counter() - _t0
         _timings['num_wallets'] = len(wallets_to_fetch)
             future_trades_for_labels: List[Dict[str, Any]],
             pooler: EmbeddingPooler,
             sample_idx: Optional[int] = None,
+            cached_holders_list: List[Dict[str, Any]] = None,
             cached_ohlc_1s: Optional[torch.Tensor] = None,
             quality_score: Optional[float] = None
         ) -> Optional[Dict[str, Any]]:
         HIGH_DEF_INTERVAL = ("1s", 1)
         MIDDLE_INTERVAL = ("30s", 30)
+        def _emit_chart_segments(trades: List[Dict[str, Any]], interval: tuple, precomputed_ohlc: List[tuple] = None):
             if not trades and precomputed_ohlc is None:
                 return []
             interval_label, interval_seconds = interval
                     'i': interval_label
                 }
                 emitted_events.append(chart_event)
             return emitted_events
+        # Build chart candidates (registration deferred until we choose exactly one interval mode)
+        chart_events_1s = []
+        chart_events_30s = []
         # Prepare 1s OHLC from cache if available
         ohlc_1s_precomputed = None
         if cached_ohlc_1s is not None and len(cached_ohlc_1s) > 0:
                      for i, row in enumerate(slice_tensor)
                  ]
+        chart_events_1s = _emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, precomputed_ohlc=ohlc_1s_precomputed)
+        chart_events_30s = _emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL)
         # 5. Process Other Records (Pool, Liquidity, Fees, Burns, Locks, Migrations)
         pool_meta_by_address = {}
             _register_event,
             cached_holders_list=cached_holders_list
         )
+        # Choose exactly one chart resolution per sample:
+        # - non-chart events plus the 1s chart events fit within self.max_seq_len -> 1s
+        # - otherwise (sequence-length pressure) -> 30s
+        non_chart_event_count = len(event_sequence_entries)
+        would_exceed = (non_chart_event_count + len(chart_events_1s)) > self.max_seq_len
+        selected_chart_events = chart_events_30s if would_exceed else chart_events_1s
+        selected_chart_signature = "chart-mid" if would_exceed else "chart-hd"
+        for chart_idx, chart_event in enumerate(selected_chart_events):
+            _register_event(
+                chart_event,
+                _event_execution_sort_key(chart_event['timestamp'], signature=f"{selected_chart_signature}-{chart_idx}")
+            )
         # 7. Finalize Sequence with Dynamic Sampling
         event_sequence_entries.sort(key=lambda x: x[0])
         raw_event_sequence = [entry[1] for entry in event_sequence_entries]
         raw_data['ohlc_1s'] = ohlc_1s
+        # Generate holder snapshots from deterministic trade-ledger reconstruction.
         interval = 300
         num_intervals = (duration_seconds // interval) + 1
         snapshot_stats = torch.zeros((num_intervals, 6), dtype=torch.float32)
             if bucket_idx >= 0:
                 buckets[bucket_idx].append(t)
+        raw_total_supply = raw_data.get('total_supply')
+        raw_decimals = raw_data.get('decimals')
+        if raw_total_supply is None or raw_decimals is None:
+            raise RuntimeError("Missing token total_supply/decimals required for holder snapshot reconstruction.")
+        total_supply_raw = int(raw_total_supply)
+        token_decimals = int(raw_decimals)
+        if total_supply_raw <= 0:
+            raise RuntimeError(f"Invalid total_supply for {token_address}: {total_supply_raw}")
+        if token_decimals < 0:
+            raise RuntimeError(f"Invalid decimals for {token_address}: {token_decimals}")
+        token_scale = 10 ** token_decimals
+        def _strict_int(v: Any, field_name: str) -> int:
+            """Convert a trade-record field to ``int`` or fail loudly.
+
+            Args:
+                v: Raw field value taken from a trade record.
+                field_name: Name of the field; used only in error messages.
+
+            Returns:
+                ``int(v)``. Note that ``int()`` truncates floats toward zero
+                rather than rejecting them.
+
+            Raises:
+                RuntimeError: If ``v`` is ``None`` or cannot be converted.
+            """
+            if v is None:
+                raise RuntimeError(f"Missing {field_name} in trade record for {token_address}.")
+            try:
+                return int(v)
+            except (TypeError, ValueError, OverflowError) as e:
+                # Only the exceptions int() raises for bad input are wrapped;
+                # anything else is a genuine bug and should surface as-is.
+                raise RuntimeError(f"Invalid {field_name} in trade record for {token_address}: {v}") from e
+        def _trade_sort_key_for_ledger(trade: Dict[str, Any]) -> tuple:
+            """Build the ordering key used to replay trades into the ledger.
+
+            The key is (timestamp order value, slot, transaction_index,
+            instruction_index, signature). An absent slot/index key counts as
+            0; a key that is present but ``None`` makes ``_strict_int`` raise.
+            A missing or falsy signature becomes the empty string.
+            """
+            # NOTE(review): _timestamp_to_order_value is defined outside this
+            # view; presumably it yields a comparable numeric value - confirm.
+            ts_order = _timestamp_to_order_value(trade.get('timestamp'))
+            position = tuple(
+                _strict_int(trade.get(name, 0), name)
+                for name in ('slot', 'transaction_index', 'instruction_index')
+            )
+            signature = trade.get('signature') or ''
+            return (ts_order, *position, str(signature))
+        ledger_trades = []
+        for trade in trades:
+            if not trade.get('success', False):
+                continue
+            maker = trade.get('maker')
+            if not maker:
+                raise RuntimeError(f"Missing maker in successful trade for {token_address}.")
+            trade_type = _strict_int(trade.get('trade_type'), 'trade_type')
+            if trade_type not in (0, 1):
+                raise RuntimeError(f"Invalid trade_type={trade_type} for {token_address}; expected 0/1.")
+            base_amount_raw = _strict_int(trade.get('base_amount'), 'base_amount')
+            if base_amount_raw < 0:
+                raise RuntimeError(f"Invalid negative base_amount={base_amount_raw} for {token_address}.")
+            ledger_trades.append((trade, maker, trade_type, base_amount_raw))
+        ledger_trades.sort(key=lambda x: _trade_sort_key_for_ledger(x[0]))
+        wallet_balances_raw: Dict[str, int] = {}
+        ledger_idx = 0
         holder_snapshots_list = []
         for i in range(num_intervals):
             bucket_trades = buckets[i]
             vol = sum(t.get('total_usd', 0.0) for t in bucket_trades)
             tx = len(bucket_trades)
             buys = sum(1 for t in bucket_trades if t.get('trade_direction') == 0 or t.get('trade_type') == 0)
             sells = tx - buys
+            snapshot_ts_epoch = t0_val + ((i + 1) * interval)
+            while ledger_idx < len(ledger_trades):
+                trade, maker, trade_type, base_amount_raw = ledger_trades[ledger_idx]
+                trade_ts = _timestamp_to_order_value(trade.get('timestamp'))
+                if trade_ts > snapshot_ts_epoch:
+                    break
+                signed_delta = base_amount_raw if trade_type == 0 else -base_amount_raw
+                wallet_balances_raw[maker] = wallet_balances_raw.get(maker, 0) + signed_delta
+                ledger_idx += 1
+            positive_holders_raw = [(wallet, bal) for wallet, bal in wallet_balances_raw.items() if bal > 0]
+            positive_holders_raw.sort(key=lambda item: (-item[1], item[0]))
+            holders_topk_raw = positive_holders_raw[:HOLDER_SNAPSHOT_TOP_K]
+            count = len(positive_holders_raw)
+            top10_sum_raw = sum(bal for _, bal in positive_holders_raw[:10])
+            top10_pct = float(top10_sum_raw) / float(total_supply_raw)
             snapshot_stats[i, 0] = float(vol)
             snapshot_stats[i, 1] = float(tx)
             snapshot_stats[i, 2] = float(buys)
             snapshot_stats[i, 3] = float(sells)
             snapshot_stats[i, 4] = float(count)
+            snapshot_stats[i, 5] = float(top10_pct)
             snapshot_ts = t0 + datetime.timedelta(seconds=(i+1)*interval)
             holder_snapshots_list.append({
                 'timestamp': int(snapshot_ts.timestamp()),
+                'holders': [
+                    {
+                        'wallet_address': wallet,
+                        'current_balance': float(balance_raw) / float(token_scale)
+                    }
+                    for wallet, balance_raw in holders_topk_raw
+                ]
             })
         raw_data['snapshots_5m'] = snapshot_stats
             if liq.get('lp_provider'):
                 all_wallets.add(liq['lp_provider'])
         for snapshot in holder_snapshots_list:
+            if not isinstance(snapshot, dict) or not isinstance(snapshot.get('holders'), list):
+                raise RuntimeError("Invalid holder_snapshots_list entry during wallet collection.")
+            for holder in snapshot['holders']:
+                if not isinstance(holder, dict) or 'wallet_address' not in holder or 'current_balance' not in holder:
+                    raise RuntimeError("Invalid holder record during wallet collection.")
+                all_wallets.add(holder['wallet_address'])
         all_wallets.discard(None)
         all_wallets.discard('')
             # Get holder snapshot at T_cutoff
             elapsed = (T_cutoff - t0).total_seconds()
             snap_idx = int(elapsed // 300)
+            if not (0 <= snap_idx < len(holder_snapshots_list)):
+                raise RuntimeError(
+                    f"holder_snapshots_list index out of range in __cacheitem_context__ "
+                    f"(snap_idx={snap_idx}, len={len(holder_snapshots_list)})."
+                )
+            snapshot_data = holder_snapshots_list[snap_idx]
+            if not isinstance(snapshot_data, dict) or not isinstance(snapshot_data.get('holders'), list):
+                raise RuntimeError("Invalid holder snapshot entry in __cacheitem_context__.")
+            for holder in snapshot_data['holders']:
+                if not isinstance(holder, dict) or 'wallet_address' not in holder or 'current_balance' not in holder:
+                    raise RuntimeError("Invalid holder record in __cacheitem_context__.")
+                wallets_to_fetch.add(holder['wallet_address'])
             wallets_to_fetch.discard(None)
             wallets_to_fetch.discard('')
                 future_trades_for_labels=raw_data['trades'],
                 pooler=pooler,
                 sample_idx=idx,
+                cached_holders_list=holder_snapshots_list,
                 cached_ohlc_1s=ohlc_1s,
                 quality_score=None  # Will be injected by cache_dataset.py
             )

data/ohlc_stats.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d6de1f1622fbf061842fa4227e2b98784c9cec39ed647f4b87df2ad5eef6e47
 size 1660

 version https://git-lfs.github.com/spec/v1
+oid sha256:f6a84b63ec605e83a655f404bc89d825aa8ffbb5ac3ea24c7d2197324646d016
 size 1660

sample_5eeBz7qY2sc8iGRc_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff