Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

data/data_loader.py +76 -64
data/ohlc_stats.npz +1 -1
ingest.sh +2 -2
log.log +2 -2
sample_3nUWyakgm159j1vT_0.json +0 -0
test_loader.py +7 -0

data/data_loader.py CHANGED Viewed

@@ -571,7 +571,7 @@ class OracleDataset(Dataset):
         buyers_seen_global = set()
         prev_holders_count = 0
-        for snapshot_data in cached_holders_list:
             if not isinstance(snapshot_data, dict):
                 continue
             ts_value = snapshot_data.get('timestamp')
@@ -582,6 +582,10 @@ class OracleDataset(Dataset):
             trades_win = [e for e in trade_events if e.get('success', False) and window_start < e['timestamp'] <= ts_value]
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
             if 'holders' not in snapshot_data or not isinstance(snapshot_data['holders'], list):
                 continue
@@ -612,7 +616,11 @@ class OracleDataset(Dataset):
                 'relative_ts': ts_value - t0_timestamp,
                 'holders': holder_entries_ts
             }
-            _register_event_fn(hs_event, self._event_execution_sort_key(ts_value, signature='HolderSnapshot') if hasattr(self, '_event_execution_sort_key') else (ts_value, 0, 0, 0, 'HolderSnapshot'))
             holder_pct_map_ts = {d['wallet']: d['holding_pct'] for d in holder_entries_ts}
             top10_holder_pct = sum(d['holding_pct'] for d in holder_entries_ts[:10]) if holder_entries_ts else 0.0
@@ -678,7 +686,11 @@ class OracleDataset(Dataset):
                 'total_txns': float(total_txns),
                 'global_fees_paid': float(global_fees_paid)
             }
-            _register_event_fn(oc_event, self._event_execution_sort_key(ts_value, signature='OnChain_Snapshot') if hasattr(self, '_event_execution_sort_key') else (ts_value, 0, 0, 0, 'OnChain_Snapshot'))
     def _calculate_deployed_token_stats(self, profiles: Dict[str, Dict[str, Any]], T_cutoff: datetime.datetime):
         """
@@ -1132,27 +1144,13 @@ class OracleDataset(Dataset):
         end_ts = int(T_cutoff.timestamp())
-        # Find the first available price for forward-filling
-        # Use the earliest trade price from intervals >= start_ts
-        last_price = None
         for interval_ts in sorted_intervals:
-            if interval_ts >= start_ts:
-                last_price = trades_by_interval[interval_ts][0]
-                break
-        # Fallback to first trade price if no intervals after start_ts
-        if last_price is None:
-            last_price = aggregation_trades[0]['price_usd']
-        for ts in range(start_ts, end_ts + 1, interval_seconds):
-            if ts in trades_by_interval:
-                prices = trades_by_interval[ts]
                 open_price = prices[0]
                 close_price = prices[-1]
-                full_ohlc.append((ts, open_price, close_price))
-                last_price = close_price
-            else:
-                full_ohlc.append((ts, last_price, last_price))
         return full_ohlc
     def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
@@ -2151,8 +2149,8 @@ class OracleDataset(Dataset):
                 closes_raw = [s[2] for s in segment]
                 chart_event = {
                     'event_type': 'Chart_Segment',
-                    'timestamp': last_ts,
-                    'relative_ts': last_ts - t0_timestamp,
                     'opens': self._normalize_price_series(opens_raw),
                     'closes': self._normalize_price_series(closes_raw),
                     'i': interval_label
@@ -2164,21 +2162,9 @@ class OracleDataset(Dataset):
         chart_events_1s = []
         chart_events_30s = []
-        # Prepare 1s OHLC from cache if available
-        ohlc_1s_precomputed = None
-        if cached_ohlc_1s is not None and len(cached_ohlc_1s) > 0:
-             # Calculate limit based on T_cutoff relative to t0
-             duration_limit = int(T_cutoff.timestamp() - t0_timestamp) + 1
-             limit = min(duration_limit, len(cached_ohlc_1s))
-             if limit > 0:
-                 slice_tensor = cached_ohlc_1s[:limit]
-                 ohlc_1s_precomputed = [
-                     (t0_timestamp + i, float(row[0]), float(row[1]))
-                     for i, row in enumerate(slice_tensor)
-                 ]
-        chart_events_1s = _emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL, precomputed_ohlc=ohlc_1s_precomputed)
         chart_events_30s = _emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL)
         # 5. Process Other Records (Pool, Liquidity, Fees, Burns, Locks, Migrations)
@@ -2364,6 +2350,53 @@ class OracleDataset(Dataset):
                 )
             )
         # 6. Generate Snapshots
         self._generate_onchain_snapshots(
             token_address, int(t0_timestamp), T_cutoff,
@@ -2373,7 +2406,7 @@ class OracleDataset(Dataset):
             wallet_data,
             total_supply_dec,
             _register_event,
-            cached_holders_list=cached_holders_list
         )
         # Choose exactly one chart resolution per sample:
@@ -2733,31 +2766,10 @@ class OracleDataset(Dataset):
         t0_val = _timestamp_to_order_value(t0)
         last_trade_ts_val = max(trade_ts_values)
         duration_seconds = int(last_trade_ts_val - t0_val) + 120
-        ohlc_1s = torch.zeros((duration_seconds, 2), dtype=torch.float32)
-        trades.sort(key=lambda x: _timestamp_to_order_value(x['timestamp']))
-        trades_by_sec = defaultdict(list)
-        for t in trades:
-            if not t.get('success', False) or float(t.get('price_usd', 0.0) or 0.0) <= 0:
-                continue
-            ts = _timestamp_to_order_value(t['timestamp'])
-            sec_idx = int(ts - t0_val)
-            if 0 <= sec_idx < duration_seconds:
-                trades_by_sec[sec_idx].append(t['price_usd'])
-        last_close = float(trades[0]['price_usd'])
-        for i in range(duration_seconds):
-            if i in trades_by_sec:
-                prices = trades_by_sec[i]
-                op, cl = prices[0], prices[-1]
-                last_close = cl
-            else:
-                op = cl = last_close
-            ohlc_1s[i, 0] = float(op)
-            ohlc_1s[i, 1] = float(cl)
-        raw_data['ohlc_1s'] = ohlc_1s
         # Generate holder snapshots from deterministic trade-ledger reconstruction.
         interval = 300
@@ -3065,7 +3077,7 @@ class OracleDataset(Dataset):
                 pooler=pooler,
                 sample_idx=idx,
                 cached_holders_list=holder_snapshots_list,
-                cached_ohlc_1s=ohlc_1s,
                 quality_score=None  # Will be injected by cache_dataset.py
             )

         buyers_seen_global = set()
         prev_holders_count = 0
+        for i, snapshot_data in enumerate(cached_holders_list):
             if not isinstance(snapshot_data, dict):
                 continue
             ts_value = snapshot_data.get('timestamp')
             trades_win = [e for e in trade_events if e.get('success', False) and window_start < e['timestamp'] <= ts_value]
             xfers_win = [e for e in transfer_events if window_start < e['timestamp'] <= ts_value]
+            # SPARSE SNAPSHOTS: Skip if absolutely nothing happened in this 5-minute window
+            if not trades_win and not xfers_win:
+                continue
             if 'holders' not in snapshot_data or not isinstance(snapshot_data['holders'], list):
                 continue
                 'relative_ts': ts_value - t0_timestamp,
                 'holders': holder_entries_ts
             }
+            _register_event_fn(
+                hs_event,
+                self._event_execution_sort_key(ts_value, slot=10**12, transaction_index=10**9, signature='HolderSnapshot')
+                if hasattr(self, '_event_execution_sort_key') else (ts_value, 10**12, 10**9, 0, 'HolderSnapshot')
+            )
             holder_pct_map_ts = {d['wallet']: d['holding_pct'] for d in holder_entries_ts}
             top10_holder_pct = sum(d['holding_pct'] for d in holder_entries_ts[:10]) if holder_entries_ts else 0.0
                 'total_txns': float(total_txns),
                 'global_fees_paid': float(global_fees_paid)
             }
+            _register_event_fn(
+                oc_event,
+                self._event_execution_sort_key(ts_value, slot=10**12, transaction_index=10**9, signature='OnChain_Snapshot')
+                if hasattr(self, '_event_execution_sort_key') else (ts_value, 10**12, 10**9, 0, 'OnChain_Snapshot')
+            )
     def _calculate_deployed_token_stats(self, profiles: Dict[str, Dict[str, Any]], T_cutoff: datetime.datetime):
         """
         end_ts = int(T_cutoff.timestamp())
         for interval_ts in sorted_intervals:
+            if start_ts <= interval_ts <= end_ts:
+                prices = trades_by_interval[interval_ts]
                 open_price = prices[0]
                 close_price = prices[-1]
+                full_ohlc.append((interval_ts, open_price, close_price))
         return full_ohlc
     def __getitem__(self, idx: int) -> Optional[Dict[str, Any]]:
                 closes_raw = [s[2] for s in segment]
                 chart_event = {
                     'event_type': 'Chart_Segment',
+                    'timestamp': int(last_ts),
+                    'relative_ts': int(last_ts) - int(t0_timestamp),
                     'opens': self._normalize_price_series(opens_raw),
                     'closes': self._normalize_price_series(closes_raw),
                     'i': interval_label
         chart_events_1s = []
         chart_events_30s = []
+        # Build chart candidates (registration deferred until we choose exactly one interval mode)
+        # We process sparse native charts using _generate_ohlc for both 1s and 30s
+        chart_events_1s = _emit_chart_segments(high_def_chart_trades, HIGH_DEF_INTERVAL)
         chart_events_30s = _emit_chart_segments(middle_chart_trades, MIDDLE_INTERVAL)
         # 5. Process Other Records (Pool, Liquidity, Fees, Burns, Locks, Migrations)
                 )
             )
+        # --- ADD DYNAMIC T_CUTOFF SNAPSHOT ---
+        # Evaluate balances exactly up to T_cutoff using the filtered trade_records
+        wallet_balances_raw = {}
+        for trade in trade_records:
+            if not trade.get('success', False):
+                continue
+            maker = trade.get('maker')
+            if not maker:
+                continue
+            try:
+                trade_type = int(trade.get('trade_type', 0))
+                base_amount_raw = int(trade.get('base_amount', 0))
+            except:
+                continue
+            if trade_type not in (0, 1) or base_amount_raw < 0:
+                continue
+            signed_delta = base_amount_raw if trade_type == 0 else -base_amount_raw
+            wallet_balances_raw[maker] = wallet_balances_raw.get(maker, 0) + signed_delta
+        positive_holders_raw = [(w, b) for w, b in wallet_balances_raw.items() if b > 0]
+        positive_holders_raw.sort(key=lambda item: (-item[1], item[0]))
+        holders_topk_raw = positive_holders_raw[:HOLDER_SNAPSHOT_TOP_K]
+        cutoff_ts_epoch = int(T_cutoff.timestamp())
+        token_scale = 10 ** base_decimals if base_decimals else 1
+        cutoff_snapshot = {
+            'timestamp': cutoff_ts_epoch,
+            'holders': [
+                {
+                    'wallet_address': w,
+                    'current_balance': float(b) / float(token_scale)
+                }
+                for w, b in holders_topk_raw
+            ]
+        }
+        # Create a local copy of cached_holders_list up to T_cutoff
+        local_holders_list = [
+            snap for snap in (cached_holders_list or [])
+            if snap.get('timestamp', 0) < cutoff_ts_epoch
+        ]
+        # Append our precise T_cutoff snapshot at the end
+        if not local_holders_list or local_holders_list[-1]['timestamp'] != cutoff_ts_epoch:
+            local_holders_list.append(cutoff_snapshot)
         # 6. Generate Snapshots
         self._generate_onchain_snapshots(
             token_address, int(t0_timestamp), T_cutoff,
             wallet_data,
             total_supply_dec,
             _register_event,
+            cached_holders_list=local_holders_list
         )
         # Choose exactly one chart resolution per sample:
         t0_val = _timestamp_to_order_value(t0)
         last_trade_ts_val = max(trade_ts_values)
+        # Disable dense OHLC 1s precomputation.
+        # Chart_Segment will now generate sparse OHLC at runtime.
         duration_seconds = int(last_trade_ts_val - t0_val) + 120
+        raw_data['ohlc_1s'] = None
         # Generate holder snapshots from deterministic trade-ledger reconstruction.
         interval = 300
                 pooler=pooler,
                 sample_idx=idx,
                 cached_holders_list=holder_snapshots_list,
+                cached_ohlc_1s=None,
                 quality_score=None  # Will be injected by cache_dataset.py
             )

data/ohlc_stats.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:720bc76c8bfa9f5c1dbb0b2a3a575b8226c761160136c4b108c08ae6cbb6c299
 size 1660

 version https://git-lfs.github.com/spec/v1
+oid sha256:87f6e823bed45b3e399d6fe2ab46f3297d80a623e5a41cca785a60c5a7db067d
 size 1660

ingest.sh CHANGED Viewed

@@ -20,7 +20,7 @@ error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
 #===============================================================================
 header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"
-EPOCHS=(844 845 846 847 848 849)
 log "Processing epochs one at a time to minimize disk usage..."
@@ -41,7 +41,7 @@ for epoch in "${EPOCHS[@]}"; do
     # Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
     log "  [2/3] Ingesting epoch ${epoch} into databases..."
-    python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j -c || {
         error "Ingestion failed for epoch ${epoch}. Cannot continue."
     }

 #===============================================================================
 header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"
+EPOCHS=(844)
 log "Processing epochs one at a time to minimize disk usage..."
     # Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
     log "  [2/3] Ingesting epoch ${epoch} into databases..."
+    python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
         error "Ingestion failed for epoch ${epoch}. Cannot continue."
     }

log.log CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27af4460defe3832c7bb709538efa4908bd370b202bad245473b23b14648e2db
-size 2881

 version https://git-lfs.github.com/spec/v1
+oid sha256:c605ecab2de1c8c8442dda85ada5345b9d6ba43aae4095130f1d92ce6261c127
+size 44400

sample_3nUWyakgm159j1vT_0.json ADDED Viewed

The diff for this file is too large to render. See raw diff

test_loader.py ADDED Viewed

	@@ -0,0 +1,7 @@

+import datetime
+from data.data_loader import OracleDataset
+from data.data_fetcher import DataFetcher
+from clickhouse_driver import Client as ClickHouseClient
+from neo4j import GraphDatabase
+# The loader depends on external packages and live database clients (ClickHouse, Neo4j), so rather than constructing it here, run the pre-existing cache script on a small sample to test it.