oracle / scripts /download_epoch_artifacts.py
zirobtc's picture
Upload folder using huggingface_hub
9dd732c
#!/usr/bin/env python3
"""
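Download a specific epoch's parquet/Neo4j artifacts from Hugging Face.

The wallet_socials parquet files are fetched as well. Everything is stored
under ./data/pump_fun/ (epoch files in epoch_<N>/, socials in socials/).

Usage:
    HF_TOKEN=your_token \
        python scripts/download_epoch_artifacts.py --epoch 851

Options:
    --epoch N              Epoch to download (defaults to 844).
    -c, --skip-clickhouse  Skip the per-epoch parquet files; fetch only the Neo4j dump.
    --token TOKEN          Hugging Face token (takes precedence over HF_TOKEN).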
"""
import argparse
import os
from pathlib import Path
from typing import List
from huggingface_hub import snapshot_download
# Hugging Face repository id used for the epoch download.
REPO_ID: str = "zirobtc/pump-fun-dataset"

# Repo type passed to the Hub client: "model", not "dataset" (per user note).
REPO_TYPE: str = "model"

# Root directory for everything this script downloads.
DEFAULT_DEST_DIR: str = "./data/pump_fun"

# Parquet file stems; each one is completed with `_epoch_{epoch}.parquet`.
PARQUET_STEMS: List[str] = [
    "wallet_profiles",
    "wallet_holdings",
    "trades",
    "transfers",
    "burns",
    "tokens",
    "mints",
    "liquidity",
    "pool_creations",
    "token_metrics",
    "wallet_profile_metrics",
    "migrations",
    "fee_collections",
    "supply_locks",
    "supply_lock_actions",
    "known_wallets",
]

# Name template of the single Neo4j dump; `{epoch}` is filled in per request.
NEO4J_FILENAME: str = "neo4j_epoch_{epoch}.dump"
def build_patterns(epoch: int, skip_clickhouse: bool = False) -> List[str]:
    """Return the file names to request from the Hub for one epoch.

    Args:
        epoch: Epoch number substituted into every file name.
        skip_clickhouse: When true, the parquet files are left out and only
            the Neo4j dump name is returned.

    Returns:
        The parquet file names in ``PARQUET_STEMS`` order followed by the
        Neo4j dump name, or a one-element list holding just the dump name
        when ``skip_clickhouse`` is set.
    """
    tag = str(epoch)
    names: List[str] = []
    if not skip_clickhouse:
        names.extend(f"{stem}_epoch_{tag}.parquet" for stem in PARQUET_STEMS)
    # The Neo4j dump always comes last.
    names.append(NEO4J_FILENAME.format(epoch=tag))
    return names
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the download script.

    Returns:
        A namespace with ``epoch`` (int, 844 when not given),
        ``skip_clickhouse`` (bool, set by ``-c``/``--skip-clickhouse``) and
        ``token`` (str, or None when not given).
    """
    cli = argparse.ArgumentParser(description="Download epoch artifacts from Hugging Face.")
    cli.add_argument(
        "--epoch",
        type=int,
        default=844,
        help="Epoch number to download (e.g., 844)",
    )
    cli.add_argument(
        "-c",
        "--skip-clickhouse",
        action="store_true",
        help="Download only the Neo4j dump",
    )
    cli.add_argument(
        "--token",
        type=str,
        default=None,
        help="Hugging Face token (or set HF_TOKEN env var)",
    )
    namespace = cli.parse_args()
    return namespace
def _download_files(repo_id: str, repo_type: str, dest_dir: Path, filenames: List[str], token) -> List[str]:
    """Download exact file names from a Hub repo and report what is missing.

    Args:
        repo_id: Hub repository id, e.g. ``"user/repo"``.
        repo_type: Hub repository type (``"model"`` or ``"dataset"``).
        dest_dir: Existing local directory the files are written into.
        filenames: Exact repo-relative file names (no wildcards); they are
            passed to the Hub client as ``allow_patterns``.
        token: Hugging Face token, or None.

    Returns:
        The entries of ``filenames`` that are not present in ``dest_dir``
        after the download, in their original order.
    """
    # NOTE(review): `local_dir_use_symlinks` and `resume_download` are reported
    # as deprecated by recent huggingface_hub releases - confirm against the
    # pinned version before removing them.
    snapshot_download(
        repo_id=repo_id,
        repo_type=repo_type,
        local_dir=str(dest_dir),
        local_dir_use_symlinks=False,
        allow_patterns=filenames,
        resume_download=True,
        token=token,
    )
    # snapshot_download does not fail when a pattern matches nothing, so an
    # unpublished epoch or a renamed file has to be detected here.
    return [name for name in filenames if not (dest_dir / name).is_file()]


def main() -> None:
    """Download the epoch artifacts and the wallet_socials files.

    The token comes from ``--token`` or, failing that, the ``HF_TOKEN``
    environment variable. Epoch files go to
    ``<DEFAULT_DEST_DIR>/epoch_<epoch>/`` and the social files to
    ``<DEFAULT_DEST_DIR>/socials/``. Requested files that are absent after
    the download are listed in a warning instead of being silently ignored.
    """
    args = parse_args()
    token = args.token or os.environ.get("HF_TOKEN")
    patterns = build_patterns(args.epoch, skip_clickhouse=args.skip_clickhouse)

    dest_root = Path(DEFAULT_DEST_DIR).expanduser()
    dest_dir = dest_root / f"epoch_{args.epoch}"
    dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading epoch {args.epoch} files from {REPO_ID} to {dest_dir}")
    print("Files:")
    for p in patterns:
        print(f" - {p}")
    missing = [
        str(dest_dir / name)
        for name in _download_files(REPO_ID, REPO_TYPE, dest_dir, patterns, token)
    ]

    # --- Download wallet_socials from zirobtc/memes ---
    SOCIAL_REPO_ID = "zirobtc/memes"
    SOCIAL_FILES = [
        "wallet_socials_1763057853.parquet",
        "wallet_socials_2.parquet",
        "wallet_socials_3.parquet",
    ]
    social_dest_dir = dest_root / "socials"
    social_dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading social artifacts from {SOCIAL_REPO_ID} to {social_dest_dir}")
    missing.extend(
        str(social_dest_dir / name)
        for name in _download_files(SOCIAL_REPO_ID, "dataset", social_dest_dir, SOCIAL_FILES, token)
    )

    if missing:
        # Kept as a warning (no non-zero exit) so the script stays best-effort.
        print("Warning: the following files were not found after download:")
        for path in missing:
            print(f" - {path}")
        print(f"Download finished with {len(missing)} missing file(s).")
    else:
        print("Download complete.")
# Run the downloader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()