zirobtc committed on
Commit
4ec4791
·
1 Parent(s): 64ff574

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -16,4 +16,4 @@ checkpoints/
16
  metadata/
17
  store/
18
  preprocessed_configs/
19
- .early.coverage
 
16
  metadata/
17
  store/
18
  preprocessed_configs/
19
+ .early.coverage
.ipynb_checkpoints/ingest-checkpoint.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Process pump.fun epochs one at a time: download each epoch's artifacts,
# ingest them into the databases (ClickHouse + Neo4j), then delete the
# parquet files so peak disk usage stays bounded (~20GB per epoch).
#
# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Keep the terminal open on any exit so the operator can read the output.
# -r stops `read` from mangling backslashes in the (discarded) input.
trap 'read -r -p "Press Enter to exit..."' EXIT

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Helper functions. error() is fatal; diagnostics go to stderr so they
# survive stdout redirection.
header() { echo -e "\n${CYAN}========================================${NC}\n${CYAN} $1${NC}\n${CYAN}========================================${NC}\n"; }
log() { echo -e "${BLUE}[INFO]${NC} $1"; }
success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
error() { echo -e "${RED}[ERROR]${NC} $1" >&2; exit 1; }

#===============================================================================
# Step 5+6: Download, Ingest, Delete (one epoch at a time to save disk)
#===============================================================================
header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"

EPOCHS=(844 845 846)

log "Processing epochs one at a time to minimize disk usage..."
log "Each epoch: ~20GB download → ingest → delete"
echo ""

for epoch in "${EPOCHS[@]}"; do
  EPOCH_DIR="./data/pump_fun/epoch_${epoch}"

  log "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  log "Processing epoch ${epoch}..."

  # Step 1: Download
  log "  [1/3] Downloading epoch ${epoch}..."
  python scripts/download_epoch_artifacts.py --epoch "$epoch" || {
    error "Failed to download epoch ${epoch}. Cannot continue."
  }

  # Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
  log "  [2/3] Ingesting epoch ${epoch} into databases..."
  python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
    error "Ingestion failed for epoch ${epoch}. Cannot continue."
  }

  # Step 3: Delete parquet files to free disk space.
  # ${EPOCH_DIR:?} aborts instead of expanding to "" (which would rm -rf the
  # working tree); -- protects against paths starting with '-'.
  log "  [3/3] Cleaning up epoch ${epoch} parquet files..."
  rm -rf -- "${EPOCH_DIR:?}"

  # Show progress (counts fall back to 0 if clickhouse-client is unavailable)
  CURRENT_MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
  CURRENT_TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
  log "  Progress: ${CURRENT_MINTS} mints, ${CURRENT_TRADES} trades"
  log "  Disk free: $(df -h . | awk 'NR==2{print $4}')"
done

# Final verification
log ""
log "Verifying final data..."
MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
log "  📊 Mints: ${MINTS}"
log "  📊 Trades: ${TRADES}"

success "All epochs processed and ingested"
.ipynb_checkpoints/pre_cache-checkpoint.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Pre-caches the dataset for training in context mode
#
# Usage:
#   ./pre_cache.sh
#
# Arguments:
#   --context_length N      Max sequence length, triggers H/B/H when exceeded (default: 4096)
#   --min_trades N          Minimum trades for T_cutoff sampling (default: 10)
#   --samples_per_token N   Number of T_cutoff samples per token (default: 1)

# set -e had been removed because a failure closed the terminal window.
# Restore strict mode and instead keep the window open with an EXIT trap
# (same pattern as ingest.sh); otherwise a failed cache_dataset.py run
# still printed "Done!" and silently produced an empty/partial cache.
set -euo pipefail
trap 'read -r -p "Press Enter to exit..."' EXIT

# Default values (environment-overridable)
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
MIN_TRADES="${MIN_TRADES:-10}"
SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"

echo "========================================"
echo "Apollo Dataset Pre-Caching (Context Mode)"
echo "========================================"
echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
echo "Samples per Token: $SAMPLES_PER_TOKEN"
echo "Output Directory: $OUTPUT_DIR"
echo "OHLC Stats Path: $OHLC_STATS_PATH"
echo "========================================"

echo "Starting dataset caching..."

# Ensure the cache directory exists (matches the updated pre_cache.sh).
mkdir -p "$OUTPUT_DIR"

python3 scripts/cache_dataset.py \
    --ohlc_stats_path "$OHLC_STATS_PATH" \
    --output_dir "$OUTPUT_DIR" \
    --context_length "$CONTEXT_LENGTH" \
    --min_trades "$MIN_TRADES" \
    --samples_per_token "$SAMPLES_PER_TOKEN" \
    "$@"

echo "Done!"
echo "Cache saved to: $OUTPUT_DIR"
data/data_loader.py CHANGED
@@ -757,18 +757,23 @@ class OracleDataset(Dataset):
757
  wallet_addresses = valid_wallets
758
 
759
  # --- Collect all unique mints from holdings, split into top 10 + rest ---
 
 
 
760
  all_holding_mints = set()
761
  top_holding_mints = set()
762
  for wallet_addr in wallet_addresses:
763
  wallet_holds = holdings.get(wallet_addr, [])
764
  for holding_item in wallet_holds:
765
- if 'mint_address' in holding_item:
766
- all_holding_mints.add(holding_item['mint_address'])
 
767
  # Pick top holdings by volume for full image processing
768
  sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
769
  for h in sorted_holds[:2]:
770
- if h.get('mint_address'):
771
- top_holding_mints.add(h['mint_address'])
 
772
 
773
  # Cap top mints at 10 for full image processing
774
  top_holding_mints = set(list(top_holding_mints)[:10])
@@ -781,7 +786,12 @@ class OracleDataset(Dataset):
781
  rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
782
  processed_new_tokens = {**top_tokens, **rest_tokens}
783
  _wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
784
- all_token_data = {**token_data, **(processed_new_tokens or {})}
 
 
 
 
 
785
 
786
  # Print wallet_data sub-timings
787
  print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
@@ -876,12 +886,41 @@ class OracleDataset(Dataset):
876
  else:
877
  holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
878
 
879
- valid_wallet_holdings.append(holding_item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
 
882
  final_wallets[addr] = {
883
- 'profile': profile_data,
884
- 'socials': social_data,
885
  'holdings': valid_wallet_holdings
886
  }
887
 
@@ -968,6 +1007,7 @@ class OracleDataset(Dataset):
968
  data['image_emb_idx'] = pooler.get_idx(image)
969
  data['name_emb_idx'] = pooler.get_idx(token_name)
970
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
 
971
 
972
  # FIX: Validate the protocol ID ---
973
  # The DB might return an ID that is out of bounds for our nn.Embedding layer.
@@ -1109,8 +1149,19 @@ class OracleDataset(Dataset):
1109
  if not cached_data:
1110
  raise RuntimeError(f"No data loaded for index {idx}")
1111
 
1112
- # Auto-detect cache mode
1113
- cache_mode = cached_data.get('cache_mode', 'raw')
 
 
 
 
 
 
 
 
 
 
 
1114
 
1115
  if cache_mode == 'context':
1116
  # CONTEXT MODE: Return pre-computed training context directly
@@ -1315,15 +1366,15 @@ class OracleDataset(Dataset):
1315
  # --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
1316
  _t0 = _time.perf_counter()
1317
 
1318
- # Build token data from cache (no HTTP calls)
1319
- offline_token_data = {token_address: raw_data}
1320
 
1321
  # If we have cached image bytes, convert to PIL Image for the pooler
1322
  cached_image_bytes = raw_data.get('cached_image_bytes')
1323
  if cached_image_bytes:
1324
  try:
1325
  cached_image = Image.open(BytesIO(cached_image_bytes))
1326
- raw_data['_cached_image_pil'] = cached_image # Store for _process_token_data
1327
  except Exception as e:
1328
  pass # Image decoding failed, will use None
1329
 
@@ -1458,6 +1509,8 @@ class OracleDataset(Dataset):
1458
  data['image_emb_idx'] = pooler.get_idx(image)
1459
  data['name_emb_idx'] = pooler.get_idx(token_name)
1460
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
 
 
1461
 
1462
  # Validate protocol ID
1463
  raw_protocol_id = data.get('protocol')
@@ -1470,6 +1523,23 @@ class OracleDataset(Dataset):
1470
 
1471
  return valid_token_data
1472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1473
  def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1474
  """
1475
  Fetches cutoff-agnostic raw token data for caching/online sampling.
@@ -2768,8 +2838,8 @@ class OracleDataset(Dataset):
2768
  # Build offline data for this context
2769
  pooler = EmbeddingPooler()
2770
 
2771
- # Process token data offline
2772
- offline_token_data = {token_address: raw_data.copy()}
2773
  if cached_image_bytes:
2774
  try:
2775
  cached_image = Image.open(BytesIO(cached_image_bytes))
@@ -2832,9 +2902,6 @@ class OracleDataset(Dataset):
2832
  )
2833
 
2834
  if result is not None:
2835
- # Store the T_cutoff used for this sample (for reproducibility tracking)
2836
- result['cached_t_cutoff_ts'] = sample_offset_ts
2837
- result['cached_sample_num'] = sample_num
2838
  results.append(result)
2839
  pass # Per-context verbose logging removed for caching speed
2840
 
 
757
  wallet_addresses = valid_wallets
758
 
759
  # --- Collect all unique mints from holdings, split into top 10 + rest ---
760
+ # Preserve seed token metadata (main token from mint record) and avoid refetching it
761
+ # from holdings/token snapshots, which may be sparse at early cutoffs.
762
+ seed_token_addresses = set(token_data.keys())
763
  all_holding_mints = set()
764
  top_holding_mints = set()
765
  for wallet_addr in wallet_addresses:
766
  wallet_holds = holdings.get(wallet_addr, [])
767
  for holding_item in wallet_holds:
768
+ mint_addr = holding_item.get('mint_address')
769
+ if mint_addr and mint_addr not in seed_token_addresses:
770
+ all_holding_mints.add(mint_addr)
771
  # Pick top holdings by volume for full image processing
772
  sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
773
  for h in sorted_holds[:2]:
774
+ mint_addr = h.get('mint_address')
775
+ if mint_addr and mint_addr not in seed_token_addresses:
776
+ top_holding_mints.add(mint_addr)
777
 
778
  # Cap top mints at 10 for full image processing
779
  top_holding_mints = set(list(top_holding_mints)[:10])
 
786
  rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
787
  processed_new_tokens = {**top_tokens, **rest_tokens}
788
  _wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
789
+ # Defensive merge: never overwrite seed token metadata with holding-token fetches.
790
+ all_token_data = dict(token_data)
791
+ for addr, data in (processed_new_tokens or {}).items():
792
+ if addr in all_token_data:
793
+ continue
794
+ all_token_data[addr] = data
795
 
796
  # Print wallet_data sub-timings
797
  print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
 
886
  else:
887
  holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
888
 
889
+ # Keep only fields used by WalletEncoder to minimize cache size.
890
+ compact_holding = {
891
+ 'mint_address': mint_addr,
892
+ 'holding_time': float(holding_item.get('holding_time', 0.0) or 0.0),
893
+ 'balance_pct_to_supply': float(holding_item.get('balance_pct_to_supply', 0.0) or 0.0),
894
+ 'history_bought_cost_sol': float(holding_item.get('history_bought_cost_sol', 0.0) or 0.0),
895
+ 'bought_amount_sol_pct_to_native_balance': float(holding_item.get('bought_amount_sol_pct_to_native_balance', 0.0) or 0.0),
896
+ 'history_total_buys': float(holding_item.get('history_total_buys', 0.0) or 0.0),
897
+ 'history_total_sells': float(holding_item.get('history_total_sells', 0.0) or 0.0),
898
+ 'realized_profit_pnl': float(holding_item.get('realized_profit_pnl', 0.0) or 0.0),
899
+ 'realized_profit_sol': float(holding_item.get('realized_profit_sol', 0.0) or 0.0),
900
+ 'history_transfer_in': float(holding_item.get('history_transfer_in', 0.0) or 0.0),
901
+ 'history_transfer_out': float(holding_item.get('history_transfer_out', 0.0) or 0.0),
902
+ 'avarage_trade_gap_seconds': float(holding_item.get('avarage_trade_gap_seconds', 0.0) or 0.0),
903
+ 'total_fees': float(holding_item.get('total_fees', 0.0) or 0.0),
904
+ }
905
+ valid_wallet_holdings.append(compact_holding)
906
 
907
+ # Keep only fields consumed by WalletEncoder.
908
+ compact_profile = {'wallet_address': addr}
909
+ for key in expected_profile_keys:
910
+ compact_profile[key] = float(profile_data.get(key, 0.0) or 0.0)
911
+ compact_profile['age'] = float(profile_data.get('age', 0.0) or 0.0)
912
+
913
+ compact_social = {
914
+ 'has_pf_profile': bool(social_data.get('has_pf_profile', False)),
915
+ 'has_twitter': bool(social_data.get('has_twitter', False)),
916
+ 'has_telegram': bool(social_data.get('has_telegram', False)),
917
+ 'is_exchange_wallet': bool(social_data.get('is_exchange_wallet', False)),
918
+ 'username_emb_idx': int(social_data.get('username_emb_idx', 0) or 0),
919
+ }
920
 
921
  final_wallets[addr] = {
922
+ 'profile': compact_profile,
923
+ 'socials': compact_social,
924
  'holdings': valid_wallet_holdings
925
  }
926
 
 
1007
  data['image_emb_idx'] = pooler.get_idx(image)
1008
  data['name_emb_idx'] = pooler.get_idx(token_name)
1009
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
1010
+ data.pop('_cached_image_pil', None)
1011
 
1012
  # FIX: Validate the protocol ID ---
1013
  # The DB might return an ID that is out of bounds for our nn.Embedding layer.
 
1149
  if not cached_data:
1150
  raise RuntimeError(f"No data loaded for index {idx}")
1151
 
1152
+ # Auto-detect cache mode. New compact context cache may omit 'cache_mode'.
1153
+ if 'cache_mode' in cached_data:
1154
+ cache_mode = cached_data.get('cache_mode', 'raw')
1155
+ else:
1156
+ has_context_shape = (
1157
+ isinstance(cached_data, dict) and
1158
+ 'event_sequence' in cached_data and
1159
+ 'tokens' in cached_data and
1160
+ 'wallets' in cached_data and
1161
+ 'labels' in cached_data and
1162
+ 'labels_mask' in cached_data
1163
+ )
1164
+ cache_mode = 'context' if has_context_shape else 'raw'
1165
 
1166
  if cache_mode == 'context':
1167
  # CONTEXT MODE: Return pre-computed training context directly
 
1366
  # --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
1367
  _t0 = _time.perf_counter()
1368
 
1369
+ # Build minimal main token metadata from cache (no HTTP calls)
1370
+ offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
1371
 
1372
  # If we have cached image bytes, convert to PIL Image for the pooler
1373
  cached_image_bytes = raw_data.get('cached_image_bytes')
1374
  if cached_image_bytes:
1375
  try:
1376
  cached_image = Image.open(BytesIO(cached_image_bytes))
1377
+ offline_token_data[token_address]['_cached_image_pil'] = cached_image
1378
  except Exception as e:
1379
  pass # Image decoding failed, will use None
1380
 
 
1509
  data['image_emb_idx'] = pooler.get_idx(image)
1510
  data['name_emb_idx'] = pooler.get_idx(token_name)
1511
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
1512
+ # Drop transient in-memory image object from cache payload.
1513
+ data.pop('_cached_image_pil', None)
1514
 
1515
  # Validate protocol ID
1516
  raw_protocol_id = data.get('protocol')
 
1523
 
1524
  return valid_token_data
1525
 
1526
+ def _build_main_token_seed(self, token_address: str, raw_data: Dict[str, Any]) -> Dict[str, Any]:
1527
+ """
1528
+ Build a minimal token metadata payload for the main token.
1529
+ Prevents raw cache blobs (trades/snapshots/etc.) from leaking into
1530
+ sample['tokens'][main_token].
1531
+ """
1532
+ return {
1533
+ 'token_address': token_address,
1534
+ 'address': token_address,
1535
+ 'name': raw_data.get('name', ''),
1536
+ 'symbol': raw_data.get('symbol', ''),
1537
+ 'token_uri': raw_data.get('token_uri', ''),
1538
+ 'protocol': raw_data.get('protocol', 1),
1539
+ 'total_supply': raw_data.get('total_supply', 0),
1540
+ 'decimals': raw_data.get('decimals', 6),
1541
+ }
1542
+
1543
  def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1544
  """
1545
  Fetches cutoff-agnostic raw token data for caching/online sampling.
 
2838
  # Build offline data for this context
2839
  pooler = EmbeddingPooler()
2840
 
2841
+ # Process token data offline (minimal main token metadata only)
2842
+ offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
2843
  if cached_image_bytes:
2844
  try:
2845
  cached_image = Image.open(BytesIO(cached_image_bytes))
 
2902
  )
2903
 
2904
  if result is not None:
 
 
 
2905
  results.append(result)
2906
  pass # Per-context verbose logging removed for caching speed
2907
 
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:460520e03b3be81e290a6356069bfc80e6e3e4870aa5baf74ecb1c33f26bf41c
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6de1f1622fbf061842fa4227e2b98784c9cec39ed647f4b87df2ad5eef6e47
3
  size 1660
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c48dfd4947f51a5bbb11626480fe6b3bf92a0c5d0d1fcd1f3dc482e23201b73
3
- size 645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21f6421b07eb49c1e0a5518a628403ce0ae7149fb81a600aebad2dfcaf0313c9
3
+ size 2854
pre_cache.sh CHANGED
@@ -3,20 +3,24 @@
3
  #
4
  # Usage:
5
  # ./pre_cache.sh
6
- #
7
- # Arguments:
8
- # --context_length N Max sequence length, triggers H/B/H when exceeded (default: 4096)
9
- # --min_trades N Minimum trades for T_cutoff sampling (default: 10)
10
- # --samples_per_token N Number of T_cutoff samples per token (default: 1)
11
 
12
- # set -e # Removed: was closing terminal on any error
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Default values
15
- CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
16
- MIN_TRADES="${MIN_TRADES:-10}"
17
- SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
18
- OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
19
- OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"
20
 
21
  echo "========================================"
22
  echo "Apollo Dataset Pre-Caching (Context Mode)"
@@ -24,18 +28,26 @@ echo "========================================"
24
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
25
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
26
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
 
 
 
27
  echo "Output Directory: $OUTPUT_DIR"
28
  echo "OHLC Stats Path: $OHLC_STATS_PATH"
29
  echo "========================================"
30
 
31
  echo "Starting dataset caching..."
32
 
 
 
33
  python3 scripts/cache_dataset.py \
34
  --ohlc_stats_path "$OHLC_STATS_PATH" \
35
  --output_dir "$OUTPUT_DIR" \
36
  --context_length "$CONTEXT_LENGTH" \
37
  --min_trades "$MIN_TRADES" \
38
  --samples_per_token "$SAMPLES_PER_TOKEN" \
 
 
 
39
  "$@"
40
 
41
  echo "Done!"
 
3
  #
4
  # Usage:
5
  # ./pre_cache.sh
 
 
 
 
 
6
 
7
+ set -euo pipefail
8
+
9
+ # =========================
10
+ # Hardcoded cache settings
11
+ # =========================
12
+ CONTEXT_LENGTH=4096
13
+ MIN_TRADES=10
14
+ SAMPLES_PER_TOKEN=1
15
+ NUM_WORKERS=1
16
+ OHLC_STATS_PATH="/workspace/apollo/data/ohlc_stats.npz"
17
+ OUTPUT_DIR="data/cache"
18
 
19
+ # Label horizons in seconds, relative to each sampled T_cutoff.
20
+ # Tuned for memecoin timing distribution (less micro-noise, more actionable windows):
21
+ # [300, 900, 1800, 3600, 7200] = [5m, 15m, 30m, 60m, 120m]
22
+ HORIZONS_SECONDS=(300 900 1800 3600 7200)
23
+ QUANTILES=(0.1 0.5 0.9)
 
24
 
25
  echo "========================================"
26
  echo "Apollo Dataset Pre-Caching (Context Mode)"
 
28
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
29
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
30
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
31
+ echo "Num Workers: $NUM_WORKERS"
32
+ echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
33
+ echo "Quantiles: ${QUANTILES[*]}"
34
  echo "Output Directory: $OUTPUT_DIR"
35
  echo "OHLC Stats Path: $OHLC_STATS_PATH"
36
  echo "========================================"
37
 
38
  echo "Starting dataset caching..."
39
 
40
+ mkdir -p "$OUTPUT_DIR"
41
+
42
  python3 scripts/cache_dataset.py \
43
  --ohlc_stats_path "$OHLC_STATS_PATH" \
44
  --output_dir "$OUTPUT_DIR" \
45
  --context_length "$CONTEXT_LENGTH" \
46
  --min_trades "$MIN_TRADES" \
47
  --samples_per_token "$SAMPLES_PER_TOKEN" \
48
+ --num_workers "$NUM_WORKERS" \
49
+ --horizons_seconds "${HORIZONS_SECONDS[@]}" \
50
+ --quantiles "${QUANTILES[@]}" \
51
  "$@"
52
 
53
  echo "Done!"
sample_36zVkFUXEhdKW5Nz_0.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/cache_dataset.py CHANGED
@@ -73,8 +73,6 @@ def _process_single_token_context(args):
73
  for ctx_idx, ctx in enumerate(contexts):
74
  ctx["quality_score"] = q_score
75
  ctx["class_id"] = class_id
76
- ctx["source_token"] = mint_addr
77
- ctx["cache_mode"] = "context"
78
  filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
79
  output_path = Path(output_dir) / filename
80
  torch.save(ctx, output_path)
 
73
  for ctx_idx, ctx in enumerate(contexts):
74
  ctx["quality_score"] = q_score
75
  ctx["class_id"] = class_id
 
 
76
  filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
77
  output_path = Path(output_dir) / filename
78
  torch.save(ctx, output_path)