Upload folder using huggingface_hub
Browse files- .gitignore +1 -1
- .ipynb_checkpoints/ingest-checkpoint.sh +67 -0
- .ipynb_checkpoints/pre_cache-checkpoint.sh +42 -0
- data/data_loader.py +85 -18
- data/ohlc_stats.npz +1 -1
- log.log +2 -2
- pre_cache.sh +24 -12
- sample_36zVkFUXEhdKW5Nz_0.json +0 -0
- scripts/cache_dataset.py +0 -2
.gitignore
CHANGED
|
@@ -16,4 +16,4 @@ checkpoints/
|
|
| 16 |
metadata/
|
| 17 |
store/
|
| 18 |
preprocessed_configs/
|
| 19 |
-
.early.coverage
|
|
|
|
| 16 |
metadata/
|
| 17 |
store/
|
| 18 |
preprocessed_configs/
|
| 19 |
+
.early.coverage
|
.ipynb_checkpoints/ingest-checkpoint.sh
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -e
|
| 3 |
+
trap 'read -p "Press Enter to exit..."' EXIT
|
| 4 |
+
|
| 5 |
+
# Colors
|
| 6 |
+
RED='\033[0;31m'
|
| 7 |
+
GREEN='\033[0;32m'
|
| 8 |
+
BLUE='\033[0;34m'
|
| 9 |
+
CYAN='\033[0;36m'
|
| 10 |
+
NC='\033[0m'
|
| 11 |
+
|
| 12 |
+
# Helper functions
|
| 13 |
+
header() { echo -e "\n${CYAN}========================================${NC}\n${CYAN} $1${NC}\n${CYAN}========================================${NC}\n"; }
|
| 14 |
+
log() { echo -e "${BLUE}[INFO]${NC} $1"; }
|
| 15 |
+
success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
|
| 16 |
+
error() { echo -e "${RED}[ERROR]${NC} $1"; exit 1; }
|
| 17 |
+
|
| 18 |
+
#===============================================================================
|
| 19 |
+
# Step 5+6: Download, Ingest, Delete (one epoch at a time to save disk)
|
| 20 |
+
#===============================================================================
|
| 21 |
+
header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"
|
| 22 |
+
|
| 23 |
+
EPOCHS=(844 845 846)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
log "Processing epochs one at a time to minimize disk usage..."
|
| 27 |
+
log "Each epoch: ~20GB download → ingest → delete"
|
| 28 |
+
echo ""
|
| 29 |
+
|
| 30 |
+
for epoch in "${EPOCHS[@]}"; do
|
| 31 |
+
EPOCH_DIR="./data/pump_fun/epoch_${epoch}"
|
| 32 |
+
|
| 33 |
+
log "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
|
| 34 |
+
log "Processing epoch ${epoch}..."
|
| 35 |
+
|
| 36 |
+
# Step 1: Download
|
| 37 |
+
log " [1/3] Downloading epoch ${epoch}..."
|
| 38 |
+
python scripts/download_epoch_artifacts.py --epoch "$epoch" || {
|
| 39 |
+
error "Failed to download epoch ${epoch}. Cannot continue."
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
# Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
|
| 43 |
+
log " [2/3] Ingesting epoch ${epoch} into databases..."
|
| 44 |
+
python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
|
| 45 |
+
error "Ingestion failed for epoch ${epoch}. Cannot continue."
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
# Step 3: Delete parquet files to free disk space
|
| 49 |
+
log " [3/3] Cleaning up epoch ${epoch} parquet files..."
|
| 50 |
+
rm -rf "$EPOCH_DIR"
|
| 51 |
+
|
| 52 |
+
# Show progress
|
| 53 |
+
CURRENT_MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
|
| 54 |
+
CURRENT_TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
|
| 55 |
+
log " Progress: ${CURRENT_MINTS} mints, ${CURRENT_TRADES} trades"
|
| 56 |
+
log " Disk free: $(df -h . | awk 'NR==2{print $4}')"
|
| 57 |
+
done
|
| 58 |
+
|
| 59 |
+
# Final verification
|
| 60 |
+
log ""
|
| 61 |
+
log "Verifying final data..."
|
| 62 |
+
MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
|
| 63 |
+
TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
|
| 64 |
+
log " 📊 Mints: ${MINTS}"
|
| 65 |
+
log " 📊 Trades: ${TRADES}"
|
| 66 |
+
|
| 67 |
+
success "All epochs processed and ingested"
|
.ipynb_checkpoints/pre_cache-checkpoint.sh
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
# Pre-caches the dataset for training in context mode
|
| 3 |
+
#
|
| 4 |
+
# Usage:
|
| 5 |
+
# ./pre_cache.sh
|
| 6 |
+
#
|
| 7 |
+
# Arguments:
|
| 8 |
+
# --context_length N Max sequence length, triggers H/B/H when exceeded (default: 4096)
|
| 9 |
+
# --min_trades N Minimum trades for T_cutoff sampling (default: 10)
|
| 10 |
+
# --samples_per_token N Number of T_cutoff samples per token (default: 1)
|
| 11 |
+
|
| 12 |
+
# set -e # Removed: was closing terminal on any error
|
| 13 |
+
|
| 14 |
+
# Default values
|
| 15 |
+
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
|
| 16 |
+
MIN_TRADES="${MIN_TRADES:-10}"
|
| 17 |
+
SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
|
| 18 |
+
OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
|
| 19 |
+
OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"
|
| 20 |
+
|
| 21 |
+
echo "========================================"
|
| 22 |
+
echo "Apollo Dataset Pre-Caching (Context Mode)"
|
| 23 |
+
echo "========================================"
|
| 24 |
+
echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
|
| 25 |
+
echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
|
| 26 |
+
echo "Samples per Token: $SAMPLES_PER_TOKEN"
|
| 27 |
+
echo "Output Directory: $OUTPUT_DIR"
|
| 28 |
+
echo "OHLC Stats Path: $OHLC_STATS_PATH"
|
| 29 |
+
echo "========================================"
|
| 30 |
+
|
| 31 |
+
echo "Starting dataset caching..."
|
| 32 |
+
|
| 33 |
+
python3 scripts/cache_dataset.py \
|
| 34 |
+
--ohlc_stats_path "$OHLC_STATS_PATH" \
|
| 35 |
+
--output_dir "$OUTPUT_DIR" \
|
| 36 |
+
--context_length "$CONTEXT_LENGTH" \
|
| 37 |
+
--min_trades "$MIN_TRADES" \
|
| 38 |
+
--samples_per_token "$SAMPLES_PER_TOKEN" \
|
| 39 |
+
"$@"
|
| 40 |
+
|
| 41 |
+
echo "Done!"
|
| 42 |
+
echo "Cache saved to: $OUTPUT_DIR"
|
data/data_loader.py
CHANGED
|
@@ -757,18 +757,23 @@ class OracleDataset(Dataset):
|
|
| 757 |
wallet_addresses = valid_wallets
|
| 758 |
|
| 759 |
# --- Collect all unique mints from holdings, split into top 10 + rest ---
|
|
|
|
|
|
|
|
|
|
| 760 |
all_holding_mints = set()
|
| 761 |
top_holding_mints = set()
|
| 762 |
for wallet_addr in wallet_addresses:
|
| 763 |
wallet_holds = holdings.get(wallet_addr, [])
|
| 764 |
for holding_item in wallet_holds:
|
| 765 |
-
|
| 766 |
-
|
|
|
|
| 767 |
# Pick top holdings by volume for full image processing
|
| 768 |
sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
|
| 769 |
for h in sorted_holds[:2]:
|
| 770 |
-
|
| 771 |
-
|
|
|
|
| 772 |
|
| 773 |
# Cap top mints at 10 for full image processing
|
| 774 |
top_holding_mints = set(list(top_holding_mints)[:10])
|
|
@@ -781,7 +786,12 @@ class OracleDataset(Dataset):
|
|
| 781 |
rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
|
| 782 |
processed_new_tokens = {**top_tokens, **rest_tokens}
|
| 783 |
_wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
|
| 784 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 785 |
|
| 786 |
# Print wallet_data sub-timings
|
| 787 |
print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
|
|
@@ -876,12 +886,41 @@ class OracleDataset(Dataset):
|
|
| 876 |
else:
|
| 877 |
holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
|
| 878 |
|
| 879 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 880 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
|
| 882 |
final_wallets[addr] = {
|
| 883 |
-
'profile':
|
| 884 |
-
'socials':
|
| 885 |
'holdings': valid_wallet_holdings
|
| 886 |
}
|
| 887 |
|
|
@@ -968,6 +1007,7 @@ class OracleDataset(Dataset):
|
|
| 968 |
data['image_emb_idx'] = pooler.get_idx(image)
|
| 969 |
data['name_emb_idx'] = pooler.get_idx(token_name)
|
| 970 |
data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
|
|
|
|
| 971 |
|
| 972 |
# FIX: Validate the protocol ID ---
|
| 973 |
# The DB might return an ID that is out of bounds for our nn.Embedding layer.
|
|
@@ -1109,8 +1149,19 @@ class OracleDataset(Dataset):
|
|
| 1109 |
if not cached_data:
|
| 1110 |
raise RuntimeError(f"No data loaded for index {idx}")
|
| 1111 |
|
| 1112 |
-
# Auto-detect cache mode
|
| 1113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1114 |
|
| 1115 |
if cache_mode == 'context':
|
| 1116 |
# CONTEXT MODE: Return pre-computed training context directly
|
|
@@ -1315,15 +1366,15 @@ class OracleDataset(Dataset):
|
|
| 1315 |
# --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
|
| 1316 |
_t0 = _time.perf_counter()
|
| 1317 |
|
| 1318 |
-
# Build token
|
| 1319 |
-
offline_token_data = {token_address: raw_data}
|
| 1320 |
|
| 1321 |
# If we have cached image bytes, convert to PIL Image for the pooler
|
| 1322 |
cached_image_bytes = raw_data.get('cached_image_bytes')
|
| 1323 |
if cached_image_bytes:
|
| 1324 |
try:
|
| 1325 |
cached_image = Image.open(BytesIO(cached_image_bytes))
|
| 1326 |
-
|
| 1327 |
except Exception as e:
|
| 1328 |
pass # Image decoding failed, will use None
|
| 1329 |
|
|
@@ -1458,6 +1509,8 @@ class OracleDataset(Dataset):
|
|
| 1458 |
data['image_emb_idx'] = pooler.get_idx(image)
|
| 1459 |
data['name_emb_idx'] = pooler.get_idx(token_name)
|
| 1460 |
data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
|
|
|
|
|
|
|
| 1461 |
|
| 1462 |
# Validate protocol ID
|
| 1463 |
raw_protocol_id = data.get('protocol')
|
|
@@ -1470,6 +1523,23 @@ class OracleDataset(Dataset):
|
|
| 1470 |
|
| 1471 |
return valid_token_data
|
| 1472 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1473 |
def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
|
| 1474 |
"""
|
| 1475 |
Fetches cutoff-agnostic raw token data for caching/online sampling.
|
|
@@ -2768,8 +2838,8 @@ class OracleDataset(Dataset):
|
|
| 2768 |
# Build offline data for this context
|
| 2769 |
pooler = EmbeddingPooler()
|
| 2770 |
|
| 2771 |
-
# Process token data offline
|
| 2772 |
-
offline_token_data = {token_address:
|
| 2773 |
if cached_image_bytes:
|
| 2774 |
try:
|
| 2775 |
cached_image = Image.open(BytesIO(cached_image_bytes))
|
|
@@ -2832,9 +2902,6 @@ class OracleDataset(Dataset):
|
|
| 2832 |
)
|
| 2833 |
|
| 2834 |
if result is not None:
|
| 2835 |
-
# Store the T_cutoff used for this sample (for reproducibility tracking)
|
| 2836 |
-
result['cached_t_cutoff_ts'] = sample_offset_ts
|
| 2837 |
-
result['cached_sample_num'] = sample_num
|
| 2838 |
results.append(result)
|
| 2839 |
pass # Per-context verbose logging removed for caching speed
|
| 2840 |
|
|
|
|
| 757 |
wallet_addresses = valid_wallets
|
| 758 |
|
| 759 |
# --- Collect all unique mints from holdings, split into top 10 + rest ---
|
| 760 |
+
# Preserve seed token metadata (main token from mint record) and avoid refetching it
|
| 761 |
+
# from holdings/token snapshots, which may be sparse at early cutoffs.
|
| 762 |
+
seed_token_addresses = set(token_data.keys())
|
| 763 |
all_holding_mints = set()
|
| 764 |
top_holding_mints = set()
|
| 765 |
for wallet_addr in wallet_addresses:
|
| 766 |
wallet_holds = holdings.get(wallet_addr, [])
|
| 767 |
for holding_item in wallet_holds:
|
| 768 |
+
mint_addr = holding_item.get('mint_address')
|
| 769 |
+
if mint_addr and mint_addr not in seed_token_addresses:
|
| 770 |
+
all_holding_mints.add(mint_addr)
|
| 771 |
# Pick top holdings by volume for full image processing
|
| 772 |
sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
|
| 773 |
for h in sorted_holds[:2]:
|
| 774 |
+
mint_addr = h.get('mint_address')
|
| 775 |
+
if mint_addr and mint_addr not in seed_token_addresses:
|
| 776 |
+
top_holding_mints.add(mint_addr)
|
| 777 |
|
| 778 |
# Cap top mints at 10 for full image processing
|
| 779 |
top_holding_mints = set(list(top_holding_mints)[:10])
|
|
|
|
| 786 |
rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
|
| 787 |
processed_new_tokens = {**top_tokens, **rest_tokens}
|
| 788 |
_wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
|
| 789 |
+
# Defensive merge: never overwrite seed token metadata with holding-token fetches.
|
| 790 |
+
all_token_data = dict(token_data)
|
| 791 |
+
for addr, data in (processed_new_tokens or {}).items():
|
| 792 |
+
if addr in all_token_data:
|
| 793 |
+
continue
|
| 794 |
+
all_token_data[addr] = data
|
| 795 |
|
| 796 |
# Print wallet_data sub-timings
|
| 797 |
print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
|
|
|
|
| 886 |
else:
|
| 887 |
holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
|
| 888 |
|
| 889 |
+
# Keep only fields used by WalletEncoder to minimize cache size.
|
| 890 |
+
compact_holding = {
|
| 891 |
+
'mint_address': mint_addr,
|
| 892 |
+
'holding_time': float(holding_item.get('holding_time', 0.0) or 0.0),
|
| 893 |
+
'balance_pct_to_supply': float(holding_item.get('balance_pct_to_supply', 0.0) or 0.0),
|
| 894 |
+
'history_bought_cost_sol': float(holding_item.get('history_bought_cost_sol', 0.0) or 0.0),
|
| 895 |
+
'bought_amount_sol_pct_to_native_balance': float(holding_item.get('bought_amount_sol_pct_to_native_balance', 0.0) or 0.0),
|
| 896 |
+
'history_total_buys': float(holding_item.get('history_total_buys', 0.0) or 0.0),
|
| 897 |
+
'history_total_sells': float(holding_item.get('history_total_sells', 0.0) or 0.0),
|
| 898 |
+
'realized_profit_pnl': float(holding_item.get('realized_profit_pnl', 0.0) or 0.0),
|
| 899 |
+
'realized_profit_sol': float(holding_item.get('realized_profit_sol', 0.0) or 0.0),
|
| 900 |
+
'history_transfer_in': float(holding_item.get('history_transfer_in', 0.0) or 0.0),
|
| 901 |
+
'history_transfer_out': float(holding_item.get('history_transfer_out', 0.0) or 0.0),
|
| 902 |
+
'avarage_trade_gap_seconds': float(holding_item.get('avarage_trade_gap_seconds', 0.0) or 0.0),
|
| 903 |
+
'total_fees': float(holding_item.get('total_fees', 0.0) or 0.0),
|
| 904 |
+
}
|
| 905 |
+
valid_wallet_holdings.append(compact_holding)
|
| 906 |
|
| 907 |
+
# Keep only fields consumed by WalletEncoder.
|
| 908 |
+
compact_profile = {'wallet_address': addr}
|
| 909 |
+
for key in expected_profile_keys:
|
| 910 |
+
compact_profile[key] = float(profile_data.get(key, 0.0) or 0.0)
|
| 911 |
+
compact_profile['age'] = float(profile_data.get('age', 0.0) or 0.0)
|
| 912 |
+
|
| 913 |
+
compact_social = {
|
| 914 |
+
'has_pf_profile': bool(social_data.get('has_pf_profile', False)),
|
| 915 |
+
'has_twitter': bool(social_data.get('has_twitter', False)),
|
| 916 |
+
'has_telegram': bool(social_data.get('has_telegram', False)),
|
| 917 |
+
'is_exchange_wallet': bool(social_data.get('is_exchange_wallet', False)),
|
| 918 |
+
'username_emb_idx': int(social_data.get('username_emb_idx', 0) or 0),
|
| 919 |
+
}
|
| 920 |
|
| 921 |
final_wallets[addr] = {
|
| 922 |
+
'profile': compact_profile,
|
| 923 |
+
'socials': compact_social,
|
| 924 |
'holdings': valid_wallet_holdings
|
| 925 |
}
|
| 926 |
|
|
|
|
| 1007 |
data['image_emb_idx'] = pooler.get_idx(image)
|
| 1008 |
data['name_emb_idx'] = pooler.get_idx(token_name)
|
| 1009 |
data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
|
| 1010 |
+
data.pop('_cached_image_pil', None)
|
| 1011 |
|
| 1012 |
# FIX: Validate the protocol ID ---
|
| 1013 |
# The DB might return an ID that is out of bounds for our nn.Embedding layer.
|
|
|
|
| 1149 |
if not cached_data:
|
| 1150 |
raise RuntimeError(f"No data loaded for index {idx}")
|
| 1151 |
|
| 1152 |
+
# Auto-detect cache mode. New compact context cache may omit 'cache_mode'.
|
| 1153 |
+
if 'cache_mode' in cached_data:
|
| 1154 |
+
cache_mode = cached_data.get('cache_mode', 'raw')
|
| 1155 |
+
else:
|
| 1156 |
+
has_context_shape = (
|
| 1157 |
+
isinstance(cached_data, dict) and
|
| 1158 |
+
'event_sequence' in cached_data and
|
| 1159 |
+
'tokens' in cached_data and
|
| 1160 |
+
'wallets' in cached_data and
|
| 1161 |
+
'labels' in cached_data and
|
| 1162 |
+
'labels_mask' in cached_data
|
| 1163 |
+
)
|
| 1164 |
+
cache_mode = 'context' if has_context_shape else 'raw'
|
| 1165 |
|
| 1166 |
if cache_mode == 'context':
|
| 1167 |
# CONTEXT MODE: Return pre-computed training context directly
|
|
|
|
| 1366 |
# --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
|
| 1367 |
_t0 = _time.perf_counter()
|
| 1368 |
|
| 1369 |
+
# Build minimal main token metadata from cache (no HTTP calls)
|
| 1370 |
+
offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
|
| 1371 |
|
| 1372 |
# If we have cached image bytes, convert to PIL Image for the pooler
|
| 1373 |
cached_image_bytes = raw_data.get('cached_image_bytes')
|
| 1374 |
if cached_image_bytes:
|
| 1375 |
try:
|
| 1376 |
cached_image = Image.open(BytesIO(cached_image_bytes))
|
| 1377 |
+
offline_token_data[token_address]['_cached_image_pil'] = cached_image
|
| 1378 |
except Exception as e:
|
| 1379 |
pass # Image decoding failed, will use None
|
| 1380 |
|
|
|
|
| 1509 |
data['image_emb_idx'] = pooler.get_idx(image)
|
| 1510 |
data['name_emb_idx'] = pooler.get_idx(token_name)
|
| 1511 |
data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
|
| 1512 |
+
# Drop transient in-memory image object from cache payload.
|
| 1513 |
+
data.pop('_cached_image_pil', None)
|
| 1514 |
|
| 1515 |
# Validate protocol ID
|
| 1516 |
raw_protocol_id = data.get('protocol')
|
|
|
|
| 1523 |
|
| 1524 |
return valid_token_data
|
| 1525 |
|
| 1526 |
+
def _build_main_token_seed(self, token_address: str, raw_data: Dict[str, Any]) -> Dict[str, Any]:
|
| 1527 |
+
"""
|
| 1528 |
+
Build a minimal token metadata payload for the main token.
|
| 1529 |
+
Prevents raw cache blobs (trades/snapshots/etc.) from leaking into
|
| 1530 |
+
sample['tokens'][main_token].
|
| 1531 |
+
"""
|
| 1532 |
+
return {
|
| 1533 |
+
'token_address': token_address,
|
| 1534 |
+
'address': token_address,
|
| 1535 |
+
'name': raw_data.get('name', ''),
|
| 1536 |
+
'symbol': raw_data.get('symbol', ''),
|
| 1537 |
+
'token_uri': raw_data.get('token_uri', ''),
|
| 1538 |
+
'protocol': raw_data.get('protocol', 1),
|
| 1539 |
+
'total_supply': raw_data.get('total_supply', 0),
|
| 1540 |
+
'decimals': raw_data.get('decimals', 6),
|
| 1541 |
+
}
|
| 1542 |
+
|
| 1543 |
def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
|
| 1544 |
"""
|
| 1545 |
Fetches cutoff-agnostic raw token data for caching/online sampling.
|
|
|
|
| 2838 |
# Build offline data for this context
|
| 2839 |
pooler = EmbeddingPooler()
|
| 2840 |
|
| 2841 |
+
# Process token data offline (minimal main token metadata only)
|
| 2842 |
+
offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
|
| 2843 |
if cached_image_bytes:
|
| 2844 |
try:
|
| 2845 |
cached_image = Image.open(BytesIO(cached_image_bytes))
|
|
|
|
| 2902 |
)
|
| 2903 |
|
| 2904 |
if result is not None:
|
|
|
|
|
|
|
|
|
|
| 2905 |
results.append(result)
|
| 2906 |
pass # Per-context verbose logging removed for caching speed
|
| 2907 |
|
data/ohlc_stats.npz
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1660
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d6de1f1622fbf061842fa4227e2b98784c9cec39ed647f4b87df2ad5eef6e47
|
| 3 |
size 1660
|
log.log
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21f6421b07eb49c1e0a5518a628403ce0ae7149fb81a600aebad2dfcaf0313c9
|
| 3 |
+
size 2854
|
pre_cache.sh
CHANGED
|
@@ -3,20 +3,24 @@
|
|
| 3 |
#
|
| 4 |
# Usage:
|
| 5 |
# ./pre_cache.sh
|
| 6 |
-
#
|
| 7 |
-
# Arguments:
|
| 8 |
-
# --context_length N Max sequence length, triggers H/B/H when exceeded (default: 4096)
|
| 9 |
-
# --min_trades N Minimum trades for T_cutoff sampling (default: 10)
|
| 10 |
-
# --samples_per_token N Number of T_cutoff samples per token (default: 1)
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
#
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"
|
| 20 |
|
| 21 |
echo "========================================"
|
| 22 |
echo "Apollo Dataset Pre-Caching (Context Mode)"
|
|
@@ -24,18 +28,26 @@ echo "========================================"
|
|
| 24 |
echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
|
| 25 |
echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
|
| 26 |
echo "Samples per Token: $SAMPLES_PER_TOKEN"
|
|
|
|
|
|
|
|
|
|
| 27 |
echo "Output Directory: $OUTPUT_DIR"
|
| 28 |
echo "OHLC Stats Path: $OHLC_STATS_PATH"
|
| 29 |
echo "========================================"
|
| 30 |
|
| 31 |
echo "Starting dataset caching..."
|
| 32 |
|
|
|
|
|
|
|
| 33 |
python3 scripts/cache_dataset.py \
|
| 34 |
--ohlc_stats_path "$OHLC_STATS_PATH" \
|
| 35 |
--output_dir "$OUTPUT_DIR" \
|
| 36 |
--context_length "$CONTEXT_LENGTH" \
|
| 37 |
--min_trades "$MIN_TRADES" \
|
| 38 |
--samples_per_token "$SAMPLES_PER_TOKEN" \
|
|
|
|
|
|
|
|
|
|
| 39 |
"$@"
|
| 40 |
|
| 41 |
echo "Done!"
|
|
|
|
| 3 |
#
|
| 4 |
# Usage:
|
| 5 |
# ./pre_cache.sh
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
set -euo pipefail
|
| 8 |
+
|
| 9 |
+
# =========================
|
| 10 |
+
# Hardcoded cache settings
|
| 11 |
+
# =========================
|
| 12 |
+
CONTEXT_LENGTH=4096
|
| 13 |
+
MIN_TRADES=10
|
| 14 |
+
SAMPLES_PER_TOKEN=1
|
| 15 |
+
NUM_WORKERS=1
|
| 16 |
+
OHLC_STATS_PATH="/workspace/apollo/data/ohlc_stats.npz"
|
| 17 |
+
OUTPUT_DIR="data/cache"
|
| 18 |
|
| 19 |
+
# Label horizons in seconds, relative to each sampled T_cutoff.
|
| 20 |
+
# Tuned for memecoin timing distribution (less micro-noise, more actionable windows):
|
| 21 |
+
# [300, 900, 1800, 3600, 7200] = [5m, 15m, 30m, 60m, 120m]
|
| 22 |
+
HORIZONS_SECONDS=(300 900 1800 3600 7200)
|
| 23 |
+
QUANTILES=(0.1 0.5 0.9)
|
|
|
|
| 24 |
|
| 25 |
echo "========================================"
|
| 26 |
echo "Apollo Dataset Pre-Caching (Context Mode)"
|
|
|
|
| 28 |
echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
|
| 29 |
echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
|
| 30 |
echo "Samples per Token: $SAMPLES_PER_TOKEN"
|
| 31 |
+
echo "Num Workers: $NUM_WORKERS"
|
| 32 |
+
echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
|
| 33 |
+
echo "Quantiles: ${QUANTILES[*]}"
|
| 34 |
echo "Output Directory: $OUTPUT_DIR"
|
| 35 |
echo "OHLC Stats Path: $OHLC_STATS_PATH"
|
| 36 |
echo "========================================"
|
| 37 |
|
| 38 |
echo "Starting dataset caching..."
|
| 39 |
|
| 40 |
+
mkdir -p "$OUTPUT_DIR"
|
| 41 |
+
|
| 42 |
python3 scripts/cache_dataset.py \
|
| 43 |
--ohlc_stats_path "$OHLC_STATS_PATH" \
|
| 44 |
--output_dir "$OUTPUT_DIR" \
|
| 45 |
--context_length "$CONTEXT_LENGTH" \
|
| 46 |
--min_trades "$MIN_TRADES" \
|
| 47 |
--samples_per_token "$SAMPLES_PER_TOKEN" \
|
| 48 |
+
--num_workers "$NUM_WORKERS" \
|
| 49 |
+
--horizons_seconds "${HORIZONS_SECONDS[@]}" \
|
| 50 |
+
--quantiles "${QUANTILES[@]}" \
|
| 51 |
"$@"
|
| 52 |
|
| 53 |
echo "Done!"
|
sample_36zVkFUXEhdKW5Nz_0.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
scripts/cache_dataset.py
CHANGED
|
@@ -73,8 +73,6 @@ def _process_single_token_context(args):
|
|
| 73 |
for ctx_idx, ctx in enumerate(contexts):
|
| 74 |
ctx["quality_score"] = q_score
|
| 75 |
ctx["class_id"] = class_id
|
| 76 |
-
ctx["source_token"] = mint_addr
|
| 77 |
-
ctx["cache_mode"] = "context"
|
| 78 |
filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
|
| 79 |
output_path = Path(output_dir) / filename
|
| 80 |
torch.save(ctx, output_path)
|
|
|
|
| 73 |
for ctx_idx, ctx in enumerate(contexts):
|
| 74 |
ctx["quality_score"] = q_score
|
| 75 |
ctx["class_id"] = class_id
|
|
|
|
|
|
|
| 76 |
filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
|
| 77 |
output_path = Path(output_dir) / filename
|
| 78 |
torch.save(ctx, output_path)
|