#!/usr/bin/env python3
"""
Download a specific epoch's parquet/Neo4j artifacts from Hugging Face.

Usage:
    HF_TOKEN=your_token python scripts/download_epoch_artifacts.py --epoch 851
"""

import argparse
import os
import sys
from pathlib import Path
from typing import List, Optional, Sequence

REPO_ID = "zirobtc/pump-fun-dataset"
REPO_TYPE = "model"  # dataset is not used here per user note
DEFAULT_DEST_DIR = "./data/pump_fun"

# File stems that are suffixed with `_epoch_{epoch}.parquet`
PARQUET_STEMS = [
    "wallet_profiles",
    "wallet_holdings",
    "trades",
    "transfers",
    "burns",
    "tokens",
    "mints",
    "liquidity",
    "pool_creations",
    "token_metrics",
    "wallet_profile_metrics",
    "migrations",
    "fee_collections",
    "supply_locks",
    "supply_lock_actions",
    "known_wallets",
]

# Single Neo4j dump name
NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"

# Wallet-social parquet files come from a separate repo; their names carry no
# epoch and they are stored in a shared `socials` directory.
SOCIAL_REPO_ID = "zirobtc/memes"
SOCIAL_REPO_TYPE = "dataset"
SOCIAL_FILES = [
    "wallet_socials_1763057853.parquet",
    "wallet_socials_2.parquet",
    "wallet_socials_3.parquet",
]


def build_patterns(epoch: int, skip_clickhouse: bool = False) -> List[str]:
    """Return the file names to request from the epoch repo.

    Args:
        epoch: Epoch number substituted into every file name.
        skip_clickhouse: When true, only the Neo4j dump name is returned and
            the per-stem parquet files are left out.

    Returns:
        The parquet file names (one per entry in ``PARQUET_STEMS``, in that
        order) followed by the Neo4j dump name, or just the dump name when
        ``skip_clickhouse`` is set.
    """
    neo4j_name = NEO4J_FILENAME.format(epoch=epoch)
    if skip_clickhouse:
        return [neo4j_name]
    parquet_names = [f"{stem}_epoch_{epoch}.parquet" for stem in PARQUET_STEMS]
    return parquet_names + [neo4j_name]


def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``epoch`` (int, default 844), ``skip_clickhouse``
        (bool) and ``token`` (str or None).
    """
    parser = argparse.ArgumentParser(
        description="Download epoch artifacts from Hugging Face."
    )
    parser.add_argument(
        "--epoch",
        type=int,
        default=844,
        help="Epoch number to download (e.g., 844)",
    )
    parser.add_argument(
        "-c",
        "--skip-clickhouse",
        action="store_true",
        help="Download only the Neo4j dump",
    )
    parser.add_argument(
        "--token",
        type=str,
        default=None,
        help="Hugging Face token (or set HF_TOKEN env var)",
    )
    return parser.parse_args(argv)


def _missing_files(dest_dir: Path, filenames: Sequence[str]) -> List[str]:
    """Return the entries of *filenames* that do not exist under *dest_dir*."""
    return [name for name in filenames if not (dest_dir / name).is_file()]


def _download_files(
    repo_id: str,
    repo_type: str,
    dest_dir: Path,
    filenames: Sequence[str],
    token: Optional[str],
) -> List[str]:
    """Fetch *filenames* from a Hugging Face repo into *dest_dir*.

    Returns the names that are still absent from *dest_dir* afterwards, and
    prints a warning to stderr for each of them.
    """
    # Imported here rather than at module level so that `--help` and the pure
    # helpers (build_patterns, parse_args) work without huggingface_hub.
    from huggingface_hub import snapshot_download

    # NOTE(review): `local_dir_use_symlinks` and `resume_download` appear to be
    # deprecated in recent huggingface_hub releases - confirm against the
    # version this project pins before removing them.
    snapshot_download(
        repo_id=repo_id,
        repo_type=repo_type,
        local_dir=str(dest_dir),
        local_dir_use_symlinks=False,
        allow_patterns=list(filenames),
        resume_download=True,
        token=token,
    )

    # The names are passed as allow-patterns, so a name that matches nothing
    # in the repo is presumably just skipped; surface that to the user.
    missing = _missing_files(dest_dir, filenames)
    for name in missing:
        print(
            f"Warning: {name} was not found in {repo_id} (nothing downloaded)",
            file=sys.stderr,
        )
    return missing


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Download the epoch artifacts, then the wallet-social parquet files.

    Epoch files are written to ``<DEFAULT_DEST_DIR>/epoch_<epoch>`` and the
    social files to ``<DEFAULT_DEST_DIR>/socials``. The social download runs
    regardless of ``--skip-clickhouse``.

    Args:
        argv: Optional argument list forwarded to :func:`parse_args`;
            ``None`` means "use ``sys.argv``".
    """
    args = parse_args(argv)
    # An explicit --token wins over the HF_TOKEN environment variable.
    token = args.token or os.environ.get("HF_TOKEN")

    patterns = build_patterns(args.epoch, skip_clickhouse=args.skip_clickhouse)
    dest_root = Path(DEFAULT_DEST_DIR).expanduser()
    dest_dir = dest_root / f"epoch_{args.epoch}"
    dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading epoch {args.epoch} files from {REPO_ID} to {dest_dir}")
    print("Files:")
    for pattern in patterns:
        print(f"  - {pattern}")

    _download_files(REPO_ID, REPO_TYPE, dest_dir, patterns, token)

    # --- Download wallet_socials from zirobtc/memes ---
    social_dest_dir = dest_root / "socials"
    social_dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading social artifacts from {SOCIAL_REPO_ID} to {social_dest_dir}")
    _download_files(
        SOCIAL_REPO_ID, SOCIAL_REPO_TYPE, social_dest_dir, SOCIAL_FILES, token
    )

    print("Download complete.")


if __name__ == "__main__":
    main()