File size: 3,339 Bytes
858826c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf92148
858826c
 
bf92148
 
 
858826c
 
 
 
 
9dd732c
bf92148
858826c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf92148
858826c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf92148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
858826c
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
"""
Download a specific epoch's parquet/Neo4j artifacts from Hugging Face.

Usage:
  HF_TOKEN=your_token \
    python scripts/download_epoch_artifacts.py --epoch 851
"""

import argparse
import os
from pathlib import Path
from typing import List

from huggingface_hub import snapshot_download


# Hugging Face repo that main() pulls the per-epoch files from.
REPO_ID = "zirobtc/pump-fun-dataset"
# Passed to snapshot_download() as `repo_type` for REPO_ID.
REPO_TYPE = "model"  # dataset is not used here per user note
# Root download directory; main() creates `epoch_{epoch}` and `socials`
# subdirectories underneath it.
DEFAULT_DEST_DIR = "./data/pump_fun"

# File stems that are suffixed with `_epoch_{epoch}.parquet`.
# build_patterns() emits one file name per stem, in this order.
PARQUET_STEMS = [
    "wallet_profiles",
    "wallet_holdings",
    "trades",
    "transfers",
    "burns",
    "tokens",
    "mints",
    "liquidity",
    "pool_creations",
    "token_metrics",
    "wallet_profile_metrics",
    "migrations",
    "fee_collections",
    "supply_locks",
    "supply_lock_actions",
    "known_wallets",
]

# Single Neo4j dump name; `{epoch}` is filled in by build_patterns().
NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"


def build_patterns(epoch: int, skip_clickhouse: bool = False) -> List[str]:
    """Return the repo file names to request for one epoch.

    Args:
        epoch: Epoch number; its ``str()`` form is embedded in every name.
        skip_clickhouse: When true, leave out the parquet files and return
            only the Neo4j dump name.

    Returns:
        One ``{stem}_epoch_{epoch}.parquet`` name per entry of
        ``PARQUET_STEMS`` (in that order, unless skipped), followed by the
        Neo4j dump name as the final element.
    """
    tag = str(epoch)
    names: List[str] = []
    if not skip_clickhouse:
        names.extend("{}_epoch_{}.parquet".format(stem, tag) for stem in PARQUET_STEMS)
    # The Neo4j dump is always requested and always comes last.
    names.append(NEO4J_FILENAME.format(epoch=tag))
    return names


def parse_args() -> argparse.Namespace:
    """Parse the downloader's command-line options from ``sys.argv``.

    Returns:
        A namespace with three attributes:
        ``epoch`` (int, 844 when not given),
        ``skip_clickhouse`` (bool, True only when ``-c``/``--skip-clickhouse``
        is passed) and
        ``token`` (str, or None when ``--token`` is not given).
    """
    cli = argparse.ArgumentParser(description="Download epoch artifacts from Hugging Face.")
    cli.add_argument(
        "--epoch",
        type=int,
        default=844,
        help="Epoch number to download (e.g., 844)",
    )
    cli.add_argument(
        "-c",
        "--skip-clickhouse",
        action="store_true",
        help="Download only the Neo4j dump",
    )
    cli.add_argument(
        "--token",
        type=str,
        default=None,
        help="Hugging Face token (or set HF_TOKEN env var)",
    )
    return cli.parse_args()


def main() -> None:
    """Download one epoch's artifacts, then the wallet-socials parquet files.

    Steps, in order:
      1. Parse the CLI options and resolve the Hugging Face token.
      2. Create ``<DEFAULT_DEST_DIR>/epoch_<epoch>`` and download into it the
         files named by ``build_patterns`` from ``REPO_ID``.
      3. Create ``<DEFAULT_DEST_DIR>/socials`` and download into it a fixed
         list of ``wallet_socials`` parquet files from ``zirobtc/memes``.

    Step 3 runs on every invocation, independent of ``--skip-clickhouse``.
    """
    opts = parse_args()
    # An explicit, non-empty --token wins; otherwise use the HF_TOKEN variable.
    hf_token = opts.token if opts.token else os.environ.get("HF_TOKEN")

    wanted = build_patterns(opts.epoch, skip_clickhouse=opts.skip_clickhouse)
    base_dir = Path(DEFAULT_DEST_DIR).expanduser()
    epoch_dir = base_dir / f"epoch_{opts.epoch}"
    epoch_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading epoch {opts.epoch} files from {REPO_ID} to {epoch_dir}")
    print("Files:")
    for name in wanted:
        print(f"  - {name}")

    # Options common to both downloads.
    # NOTE(review): newer huggingface_hub releases appear to deprecate
    # `local_dir_use_symlinks` and `resume_download` -- confirm against the
    # installed version before removing them.
    shared_options = {
        "local_dir_use_symlinks": False,
        "resume_download": True,
        "token": hf_token,
    }

    snapshot_download(
        repo_id=REPO_ID,
        repo_type=REPO_TYPE,
        local_dir=str(epoch_dir),
        allow_patterns=wanted,
        **shared_options,
    )

    # --- Wallet socials live in a separate dataset repo: zirobtc/memes ---
    socials_repo = "zirobtc/memes"
    socials_files = [
        "wallet_socials_1763057853.parquet",
        "wallet_socials_2.parquet",
        "wallet_socials_3.parquet",
    ]

    socials_dir = base_dir / "socials"
    socials_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading social artifacts from {socials_repo} to {socials_dir}")
    snapshot_download(
        repo_id=socials_repo,
        repo_type="dataset",
        local_dir=str(socials_dir),
        allow_patterns=socials_files,
        **shared_options,
    )

    print("Download complete.")


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()