zirobtc committed on
Commit
4ec4791
·
1 Parent(s): 64ff574

Upload folder using huggingface_hub

Browse files
.gitignore CHANGED
@@ -16,4 +16,4 @@ checkpoints/
16
  metadata/
17
  store/
18
  preprocessed_configs/
19
- .early.coverage
 
16
  metadata/
17
  store/
18
  preprocessed_configs/
19
+ .early.coverage
.ipynb_checkpoints/ingest-checkpoint.sh ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Process pump.fun epochs one at a time: download each epoch's artifacts,
# ingest them into the databases (ClickHouse + Neo4j), then delete the
# parquet files so peak disk usage stays bounded (~20GB per epoch).
#
# Strict mode: abort on errors, unset variables, and mid-pipeline failures.
set -euo pipefail

# Keep the terminal open on any exit so the operator can read the output.
# -r stops `read` from mangling backslashes in the (discarded) input.
trap 'read -r -p "Press Enter to exit..."' EXIT

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Helper functions. error() is fatal; diagnostics go to stderr so they
# survive stdout redirection.
header() { echo -e "\n${CYAN}========================================${NC}\n${CYAN} $1${NC}\n${CYAN}========================================${NC}\n"; }
log() { echo -e "${BLUE}[INFO]${NC} $1"; }
success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
error() { echo -e "${RED}[ERROR]${NC} $1" >&2; exit 1; }

#===============================================================================
# Step 5+6: Download, Ingest, Delete (one epoch at a time to save disk)
#===============================================================================
header "Step 5-6/7: Processing Epochs (Download → Ingest → Delete)"

EPOCHS=(844 845 846)

log "Processing epochs one at a time to minimize disk usage..."
log "Each epoch: ~20GB download → ingest → delete"
echo ""

for epoch in "${EPOCHS[@]}"; do
  EPOCH_DIR="./data/pump_fun/epoch_${epoch}"

  log "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  log "Processing epoch ${epoch}..."

  # Step 1: Download
  log "  [1/3] Downloading epoch ${epoch}..."
  python scripts/download_epoch_artifacts.py --epoch "$epoch" || {
    error "Failed to download epoch ${epoch}. Cannot continue."
  }

  # Step 2: Ingest (always pass --merge-neo4j; auto-detect handles empty DB)
  log "  [2/3] Ingesting epoch ${epoch} into databases..."
  python scripts/ingest_epoch.py --epoch "$epoch" --merge-neo4j || {
    error "Ingestion failed for epoch ${epoch}. Cannot continue."
  }

  # Step 3: Delete parquet files to free disk space.
  # ${EPOCH_DIR:?} aborts instead of expanding to "" (which would rm -rf the
  # working tree); -- protects against paths starting with '-'.
  log "  [3/3] Cleaning up epoch ${epoch} parquet files..."
  rm -rf -- "${EPOCH_DIR:?}"

  # Show progress (counts fall back to 0 if clickhouse-client is unavailable)
  CURRENT_MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
  CURRENT_TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
  log "  Progress: ${CURRENT_MINTS} mints, ${CURRENT_TRADES} trades"
  log "  Disk free: $(df -h . | awk 'NR==2{print $4}')"
done

# Final verification
log ""
log "Verifying final data..."
MINTS=$(clickhouse-client --query "SELECT count() FROM mints" 2>/dev/null || echo "0")
TRADES=$(clickhouse-client --query "SELECT count() FROM trades" 2>/dev/null || echo "0")
log "  📊 Mints: ${MINTS}"
log "  📊 Trades: ${TRADES}"

success "All epochs processed and ingested"
.ipynb_checkpoints/pre_cache-checkpoint.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Pre-caches the dataset for training in context mode
#
# Usage:
#   ./pre_cache.sh
#
# Arguments:
#   --context_length N      Max sequence length, triggers H/B/H when exceeded (default: 4096)
#   --min_trades N          Minimum trades for T_cutoff sampling (default: 10)
#   --samples_per_token N   Number of T_cutoff samples per token (default: 1)

# set -e had been removed because a failure closed the terminal window.
# Restore strict mode and instead keep the window open with an EXIT trap
# (same pattern as ingest.sh); otherwise a failed cache_dataset.py run
# still printed "Done!" and silently produced an empty/partial cache.
set -euo pipefail
trap 'read -r -p "Press Enter to exit..."' EXIT

# Default values (environment-overridable)
CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
MIN_TRADES="${MIN_TRADES:-10}"
SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"

echo "========================================"
echo "Apollo Dataset Pre-Caching (Context Mode)"
echo "========================================"
echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
echo "Samples per Token: $SAMPLES_PER_TOKEN"
echo "Output Directory: $OUTPUT_DIR"
echo "OHLC Stats Path: $OHLC_STATS_PATH"
echo "========================================"

echo "Starting dataset caching..."

# Ensure the cache directory exists (matches the updated pre_cache.sh).
mkdir -p "$OUTPUT_DIR"

python3 scripts/cache_dataset.py \
    --ohlc_stats_path "$OHLC_STATS_PATH" \
    --output_dir "$OUTPUT_DIR" \
    --context_length "$CONTEXT_LENGTH" \
    --min_trades "$MIN_TRADES" \
    --samples_per_token "$SAMPLES_PER_TOKEN" \
    "$@"

echo "Done!"
echo "Cache saved to: $OUTPUT_DIR"
data/data_loader.py CHANGED
@@ -757,18 +757,23 @@ class OracleDataset(Dataset):
757
  wallet_addresses = valid_wallets
758
 
759
  # --- Collect all unique mints from holdings, split into top 10 + rest ---
 
 
 
760
  all_holding_mints = set()
761
  top_holding_mints = set()
762
  for wallet_addr in wallet_addresses:
763
  wallet_holds = holdings.get(wallet_addr, [])
764
  for holding_item in wallet_holds:
765
- if 'mint_address' in holding_item:
766
- all_holding_mints.add(holding_item['mint_address'])
 
767
  # Pick top holdings by volume for full image processing
768
  sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
769
  for h in sorted_holds[:2]:
770
- if h.get('mint_address'):
771
- top_holding_mints.add(h['mint_address'])
 
772
 
773
  # Cap top mints at 10 for full image processing
774
  top_holding_mints = set(list(top_holding_mints)[:10])
@@ -781,7 +786,12 @@ class OracleDataset(Dataset):
781
  rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
782
  processed_new_tokens = {**top_tokens, **rest_tokens}
783
  _wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
784
- all_token_data = {**token_data, **(processed_new_tokens or {})}
 
 
 
 
 
785
 
786
  # Print wallet_data sub-timings
787
  print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
@@ -876,12 +886,41 @@ class OracleDataset(Dataset):
876
  else:
877
  holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
878
 
879
- valid_wallet_holdings.append(holding_item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
880
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
 
882
  final_wallets[addr] = {
883
- 'profile': profile_data,
884
- 'socials': social_data,
885
  'holdings': valid_wallet_holdings
886
  }
887
 
@@ -968,6 +1007,7 @@ class OracleDataset(Dataset):
968
  data['image_emb_idx'] = pooler.get_idx(image)
969
  data['name_emb_idx'] = pooler.get_idx(token_name)
970
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
 
971
 
972
  # FIX: Validate the protocol ID ---
973
  # The DB might return an ID that is out of bounds for our nn.Embedding layer.
@@ -1109,8 +1149,19 @@ class OracleDataset(Dataset):
1109
  if not cached_data:
1110
  raise RuntimeError(f"No data loaded for index {idx}")
1111
 
1112
- # Auto-detect cache mode
1113
- cache_mode = cached_data.get('cache_mode', 'raw')
 
 
 
 
 
 
 
 
 
 
 
1114
 
1115
  if cache_mode == 'context':
1116
  # CONTEXT MODE: Return pre-computed training context directly
@@ -1315,15 +1366,15 @@ class OracleDataset(Dataset):
1315
  # --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
1316
  _t0 = _time.perf_counter()
1317
 
1318
- # Build token data from cache (no HTTP calls)
1319
- offline_token_data = {token_address: raw_data}
1320
 
1321
  # If we have cached image bytes, convert to PIL Image for the pooler
1322
  cached_image_bytes = raw_data.get('cached_image_bytes')
1323
  if cached_image_bytes:
1324
  try:
1325
  cached_image = Image.open(BytesIO(cached_image_bytes))
1326
- raw_data['_cached_image_pil'] = cached_image # Store for _process_token_data
1327
  except Exception as e:
1328
  pass # Image decoding failed, will use None
1329
 
@@ -1458,6 +1509,8 @@ class OracleDataset(Dataset):
1458
  data['image_emb_idx'] = pooler.get_idx(image)
1459
  data['name_emb_idx'] = pooler.get_idx(token_name)
1460
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
 
 
1461
 
1462
  # Validate protocol ID
1463
  raw_protocol_id = data.get('protocol')
@@ -1470,6 +1523,23 @@ class OracleDataset(Dataset):
1470
 
1471
  return valid_token_data
1472
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1473
  def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1474
  """
1475
  Fetches cutoff-agnostic raw token data for caching/online sampling.
@@ -2768,8 +2838,8 @@ class OracleDataset(Dataset):
2768
  # Build offline data for this context
2769
  pooler = EmbeddingPooler()
2770
 
2771
- # Process token data offline
2772
- offline_token_data = {token_address: raw_data.copy()}
2773
  if cached_image_bytes:
2774
  try:
2775
  cached_image = Image.open(BytesIO(cached_image_bytes))
@@ -2832,9 +2902,6 @@ class OracleDataset(Dataset):
2832
  )
2833
 
2834
  if result is not None:
2835
- # Store the T_cutoff used for this sample (for reproducibility tracking)
2836
- result['cached_t_cutoff_ts'] = sample_offset_ts
2837
- result['cached_sample_num'] = sample_num
2838
  results.append(result)
2839
  pass # Per-context verbose logging removed for caching speed
2840
 
 
757
  wallet_addresses = valid_wallets
758
 
759
  # --- Collect all unique mints from holdings, split into top 10 + rest ---
760
+ # Preserve seed token metadata (main token from mint record) and avoid refetching it
761
+ # from holdings/token snapshots, which may be sparse at early cutoffs.
762
+ seed_token_addresses = set(token_data.keys())
763
  all_holding_mints = set()
764
  top_holding_mints = set()
765
  for wallet_addr in wallet_addresses:
766
  wallet_holds = holdings.get(wallet_addr, [])
767
  for holding_item in wallet_holds:
768
+ mint_addr = holding_item.get('mint_address')
769
+ if mint_addr and mint_addr not in seed_token_addresses:
770
+ all_holding_mints.add(mint_addr)
771
  # Pick top holdings by volume for full image processing
772
  sorted_holds = sorted(wallet_holds, key=lambda h: float(h.get('total_volume_usd', 0) or 0), reverse=True)
773
  for h in sorted_holds[:2]:
774
+ mint_addr = h.get('mint_address')
775
+ if mint_addr and mint_addr not in seed_token_addresses:
776
+ top_holding_mints.add(mint_addr)
777
 
778
  # Cap top mints at 10 for full image processing
779
  top_holding_mints = set(list(top_holding_mints)[:10])
 
786
  rest_tokens = self._process_token_data_lightweight(list(rest_holding_mints), pooler, T_cutoff) if rest_holding_mints else {}
787
  processed_new_tokens = {**top_tokens, **rest_tokens}
788
  _wd_timings['holding_token_processing'] = _time.perf_counter() - _t0
789
+ # Defensive merge: never overwrite seed token metadata with holding-token fetches.
790
+ all_token_data = dict(token_data)
791
+ for addr, data in (processed_new_tokens or {}).items():
792
+ if addr in all_token_data:
793
+ continue
794
+ all_token_data[addr] = data
795
 
796
  # Print wallet_data sub-timings
797
  print(f" [WALLET_DATA] db_fetch: {_wd_timings['db_fetch']*1000:.1f}ms, "
 
886
  else:
887
  holding_item['bought_amount_sol_pct_to_native_balance'] = 0.0
888
 
889
+ # Keep only fields used by WalletEncoder to minimize cache size.
890
+ compact_holding = {
891
+ 'mint_address': mint_addr,
892
+ 'holding_time': float(holding_item.get('holding_time', 0.0) or 0.0),
893
+ 'balance_pct_to_supply': float(holding_item.get('balance_pct_to_supply', 0.0) or 0.0),
894
+ 'history_bought_cost_sol': float(holding_item.get('history_bought_cost_sol', 0.0) or 0.0),
895
+ 'bought_amount_sol_pct_to_native_balance': float(holding_item.get('bought_amount_sol_pct_to_native_balance', 0.0) or 0.0),
896
+ 'history_total_buys': float(holding_item.get('history_total_buys', 0.0) or 0.0),
897
+ 'history_total_sells': float(holding_item.get('history_total_sells', 0.0) or 0.0),
898
+ 'realized_profit_pnl': float(holding_item.get('realized_profit_pnl', 0.0) or 0.0),
899
+ 'realized_profit_sol': float(holding_item.get('realized_profit_sol', 0.0) or 0.0),
900
+ 'history_transfer_in': float(holding_item.get('history_transfer_in', 0.0) or 0.0),
901
+ 'history_transfer_out': float(holding_item.get('history_transfer_out', 0.0) or 0.0),
902
+ 'avarage_trade_gap_seconds': float(holding_item.get('avarage_trade_gap_seconds', 0.0) or 0.0),
903
+ 'total_fees': float(holding_item.get('total_fees', 0.0) or 0.0),
904
+ }
905
+ valid_wallet_holdings.append(compact_holding)
906
 
907
+ # Keep only fields consumed by WalletEncoder.
908
+ compact_profile = {'wallet_address': addr}
909
+ for key in expected_profile_keys:
910
+ compact_profile[key] = float(profile_data.get(key, 0.0) or 0.0)
911
+ compact_profile['age'] = float(profile_data.get('age', 0.0) or 0.0)
912
+
913
+ compact_social = {
914
+ 'has_pf_profile': bool(social_data.get('has_pf_profile', False)),
915
+ 'has_twitter': bool(social_data.get('has_twitter', False)),
916
+ 'has_telegram': bool(social_data.get('has_telegram', False)),
917
+ 'is_exchange_wallet': bool(social_data.get('is_exchange_wallet', False)),
918
+ 'username_emb_idx': int(social_data.get('username_emb_idx', 0) or 0),
919
+ }
920
 
921
  final_wallets[addr] = {
922
+ 'profile': compact_profile,
923
+ 'socials': compact_social,
924
  'holdings': valid_wallet_holdings
925
  }
926
 
 
1007
  data['image_emb_idx'] = pooler.get_idx(image)
1008
  data['name_emb_idx'] = pooler.get_idx(token_name)
1009
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
1010
+ data.pop('_cached_image_pil', None)
1011
 
1012
  # FIX: Validate the protocol ID ---
1013
  # The DB might return an ID that is out of bounds for our nn.Embedding layer.
 
1149
  if not cached_data:
1150
  raise RuntimeError(f"No data loaded for index {idx}")
1151
 
1152
+ # Auto-detect cache mode. New compact context cache may omit 'cache_mode'.
1153
+ if 'cache_mode' in cached_data:
1154
+ cache_mode = cached_data.get('cache_mode', 'raw')
1155
+ else:
1156
+ has_context_shape = (
1157
+ isinstance(cached_data, dict) and
1158
+ 'event_sequence' in cached_data and
1159
+ 'tokens' in cached_data and
1160
+ 'wallets' in cached_data and
1161
+ 'labels' in cached_data and
1162
+ 'labels_mask' in cached_data
1163
+ )
1164
+ cache_mode = 'context' if has_context_shape else 'raw'
1165
 
1166
  if cache_mode == 'context':
1167
  # CONTEXT MODE: Return pre-computed training context directly
 
1366
  # --- TIMING: Token data (OFFLINE - uses cached image bytes) ---
1367
  _t0 = _time.perf_counter()
1368
 
1369
+ # Build minimal main token metadata from cache (no HTTP calls)
1370
+ offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
1371
 
1372
  # If we have cached image bytes, convert to PIL Image for the pooler
1373
  cached_image_bytes = raw_data.get('cached_image_bytes')
1374
  if cached_image_bytes:
1375
  try:
1376
  cached_image = Image.open(BytesIO(cached_image_bytes))
1377
+ offline_token_data[token_address]['_cached_image_pil'] = cached_image
1378
  except Exception as e:
1379
  pass # Image decoding failed, will use None
1380
 
 
1509
  data['image_emb_idx'] = pooler.get_idx(image)
1510
  data['name_emb_idx'] = pooler.get_idx(token_name)
1511
  data['symbol_emb_idx'] = pooler.get_idx(token_symbol)
1512
+ # Drop transient in-memory image object from cache payload.
1513
+ data.pop('_cached_image_pil', None)
1514
 
1515
  # Validate protocol ID
1516
  raw_protocol_id = data.get('protocol')
 
1523
 
1524
  return valid_token_data
1525
 
1526
+ def _build_main_token_seed(self, token_address: str, raw_data: Dict[str, Any]) -> Dict[str, Any]:
1527
+ """
1528
+ Build a minimal token metadata payload for the main token.
1529
+ Prevents raw cache blobs (trades/snapshots/etc.) from leaking into
1530
+ sample['tokens'][main_token].
1531
+ """
1532
+ return {
1533
+ 'token_address': token_address,
1534
+ 'address': token_address,
1535
+ 'name': raw_data.get('name', ''),
1536
+ 'symbol': raw_data.get('symbol', ''),
1537
+ 'token_uri': raw_data.get('token_uri', ''),
1538
+ 'protocol': raw_data.get('protocol', 1),
1539
+ 'total_supply': raw_data.get('total_supply', 0),
1540
+ 'decimals': raw_data.get('decimals', 6),
1541
+ }
1542
+
1543
  def __cacheitem__(self, idx: int) -> Optional[Dict[str, Any]]:
1544
  """
1545
  Fetches cutoff-agnostic raw token data for caching/online sampling.
 
2838
  # Build offline data for this context
2839
  pooler = EmbeddingPooler()
2840
 
2841
+ # Process token data offline (minimal main token metadata only)
2842
+ offline_token_data = {token_address: self._build_main_token_seed(token_address, raw_data)}
2843
  if cached_image_bytes:
2844
  try:
2845
  cached_image = Image.open(BytesIO(cached_image_bytes))
 
2902
  )
2903
 
2904
  if result is not None:
 
 
 
2905
  results.append(result)
2906
  pass # Per-context verbose logging removed for caching speed
2907
 
data/ohlc_stats.npz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:460520e03b3be81e290a6356069bfc80e6e3e4870aa5baf74ecb1c33f26bf41c
3
  size 1660
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6de1f1622fbf061842fa4227e2b98784c9cec39ed647f4b87df2ad5eef6e47
3
  size 1660
log.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c48dfd4947f51a5bbb11626480fe6b3bf92a0c5d0d1fcd1f3dc482e23201b73
3
- size 645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21f6421b07eb49c1e0a5518a628403ce0ae7149fb81a600aebad2dfcaf0313c9
3
+ size 2854
pre_cache.sh CHANGED
@@ -3,20 +3,24 @@
3
  #
4
  # Usage:
5
  # ./pre_cache.sh
6
- #
7
- # Arguments:
8
- # --context_length N Max sequence length, triggers H/B/H when exceeded (default: 4096)
9
- # --min_trades N Minimum trades for T_cutoff sampling (default: 10)
10
- # --samples_per_token N Number of T_cutoff samples per token (default: 1)
11
 
12
- # set -e # Removed: was closing terminal on any error
 
 
 
 
 
 
 
 
 
 
13
 
14
- # Default values
15
- CONTEXT_LENGTH="${CONTEXT_LENGTH:-4096}"
16
- MIN_TRADES="${MIN_TRADES:-10}"
17
- SAMPLES_PER_TOKEN="${SAMPLES_PER_TOKEN:-1}"
18
- OHLC_STATS_PATH="${OHLC_STATS_PATH:-/workspace/apollo/data/ohlc_stats.npz}"
19
- OUTPUT_DIR="${OUTPUT_DIR:-data/cache}"
20
 
21
  echo "========================================"
22
  echo "Apollo Dataset Pre-Caching (Context Mode)"
@@ -24,18 +28,26 @@ echo "========================================"
24
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
25
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
26
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
 
 
 
27
  echo "Output Directory: $OUTPUT_DIR"
28
  echo "OHLC Stats Path: $OHLC_STATS_PATH"
29
  echo "========================================"
30
 
31
  echo "Starting dataset caching..."
32
 
 
 
33
  python3 scripts/cache_dataset.py \
34
  --ohlc_stats_path "$OHLC_STATS_PATH" \
35
  --output_dir "$OUTPUT_DIR" \
36
  --context_length "$CONTEXT_LENGTH" \
37
  --min_trades "$MIN_TRADES" \
38
  --samples_per_token "$SAMPLES_PER_TOKEN" \
 
 
 
39
  "$@"
40
 
41
  echo "Done!"
 
3
  #
4
  # Usage:
5
  # ./pre_cache.sh
 
 
 
 
 
6
 
7
+ set -euo pipefail
8
+
9
+ # =========================
10
+ # Hardcoded cache settings
11
+ # =========================
12
+ CONTEXT_LENGTH=4096
13
+ MIN_TRADES=10
14
+ SAMPLES_PER_TOKEN=1
15
+ NUM_WORKERS=1
16
+ OHLC_STATS_PATH="/workspace/apollo/data/ohlc_stats.npz"
17
+ OUTPUT_DIR="data/cache"
18
 
19
+ # Label horizons in seconds, relative to each sampled T_cutoff.
20
+ # Tuned for memecoin timing distribution (less micro-noise, more actionable windows):
21
+ # [300, 900, 1800, 3600, 7200] = [5m, 15m, 30m, 60m, 120m]
22
+ HORIZONS_SECONDS=(300 900 1800 3600 7200)
23
+ QUANTILES=(0.1 0.5 0.9)
 
24
 
25
  echo "========================================"
26
  echo "Apollo Dataset Pre-Caching (Context Mode)"
 
28
  echo "Context Length (H/B/H threshold): $CONTEXT_LENGTH"
29
  echo "Min Trades (T_cutoff threshold): $MIN_TRADES"
30
  echo "Samples per Token: $SAMPLES_PER_TOKEN"
31
+ echo "Num Workers: $NUM_WORKERS"
32
+ echo "Horizons (sec): ${HORIZONS_SECONDS[*]}"
33
+ echo "Quantiles: ${QUANTILES[*]}"
34
  echo "Output Directory: $OUTPUT_DIR"
35
  echo "OHLC Stats Path: $OHLC_STATS_PATH"
36
  echo "========================================"
37
 
38
  echo "Starting dataset caching..."
39
 
40
+ mkdir -p "$OUTPUT_DIR"
41
+
42
  python3 scripts/cache_dataset.py \
43
  --ohlc_stats_path "$OHLC_STATS_PATH" \
44
  --output_dir "$OUTPUT_DIR" \
45
  --context_length "$CONTEXT_LENGTH" \
46
  --min_trades "$MIN_TRADES" \
47
  --samples_per_token "$SAMPLES_PER_TOKEN" \
48
+ --num_workers "$NUM_WORKERS" \
49
+ --horizons_seconds "${HORIZONS_SECONDS[@]}" \
50
+ --quantiles "${QUANTILES[@]}" \
51
  "$@"
52
 
53
  echo "Done!"
sample_36zVkFUXEhdKW5Nz_0.json ADDED
The diff for this file is too large to render. See raw diff
 
scripts/cache_dataset.py CHANGED
@@ -73,8 +73,6 @@ def _process_single_token_context(args):
73
  for ctx_idx, ctx in enumerate(contexts):
74
  ctx["quality_score"] = q_score
75
  ctx["class_id"] = class_id
76
- ctx["source_token"] = mint_addr
77
- ctx["cache_mode"] = "context"
78
  filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
79
  output_path = Path(output_dir) / filename
80
  torch.save(ctx, output_path)
 
73
  for ctx_idx, ctx in enumerate(contexts):
74
  ctx["quality_score"] = q_score
75
  ctx["class_id"] = class_id
 
 
76
  filename = f"sample_{mint_addr[:16]}_{ctx_idx}.pt"
77
  output_path = Path(output_dir) / filename
78
  torch.save(ctx, output_path)