| |
| """ |
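Download a specific epoch's parquet/Neo4j artifacts from Hugging Face.

The wallet-socials parquet files are also downloaded, into a shared
``socials`` directory next to the per-epoch directory.

Usage:
    HF_TOKEN=your_token \
    python scripts/download_epoch_artifacts.py --epoch 851

Pass ``-c`` / ``--skip-clickhouse`` to skip the per-epoch parquet files and
fetch only the Neo4j dump for that epoch.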
| """ |
|
|
import argparse
import os
from pathlib import Path
from typing import List, Optional

from huggingface_hub import snapshot_download
|
|
|
|
# Hugging Face repository that hosts the per-epoch artifacts.
REPO_ID = "zirobtc/pump-fun-dataset"
# Passed as `repo_type` to `snapshot_download` for REPO_ID.
# NOTE(review): the repo name suggests a dataset, yet it is fetched as a
# "model" repo — confirm this matches how the repo was created on the Hub.
REPO_TYPE = "model"
# Root directory for downloads; `main` creates `epoch_<N>/` and `socials/`
# beneath it.
DEFAULT_DEST_DIR = "./data/pump_fun"


# Filename stems of the per-epoch parquet files. `build_patterns` expands each
# stem to "<stem>_epoch_<N>.parquet".
PARQUET_STEMS = [
    "wallet_profiles",
    "wallet_holdings",
    "trades",
    "transfers",
    "burns",
    "tokens",
    "mints",
    "liquidity",
    "pool_creations",
    "token_metrics",
    "wallet_profile_metrics",
    "migrations",
    "fee_collections",
    "supply_locks",
    "supply_lock_actions",
    "known_wallets",
]


# Filename template of the Neo4j dump for one epoch; `{epoch}` is filled in by
# `build_patterns`.
NEO4J_FILENAME = "neo4j_epoch_{epoch}.dump"
|
|
|
|
def build_patterns(epoch: int, skip_clickhouse: bool = False) -> List[str]:
    """Return the artifact filenames to request for ``epoch``.

    Args:
        epoch: Epoch number embedded in every filename.
        skip_clickhouse: When true, the parquet filenames are left out and
            only the Neo4j dump filename is returned.

    Returns:
        One parquet filename per entry in ``PARQUET_STEMS`` (unless skipped),
        followed by the Neo4j dump filename as the last element.
    """
    tag = str(epoch)
    wanted = (
        []
        if skip_clickhouse
        else [f"{stem}_epoch_{tag}.parquet" for stem in PARQUET_STEMS]
    )
    # The Neo4j dump is always requested and always comes last.
    wanted.append(NEO4J_FILENAME.format(epoch=tag))
    return wanted
|
|
|
|
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the downloader's command-line options.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the script entry point uses;
            passing an explicit list makes the parser callable from tests or
            other code.

    Returns:
        Namespace with ``epoch`` (int, defaults to 844), ``skip_clickhouse``
        (bool, false unless ``-c``/``--skip-clickhouse`` is given) and
        ``token`` (str, or ``None`` when ``--token`` is not given).
    """
    parser = argparse.ArgumentParser(description="Download epoch artifacts from Hugging Face.")
    parser.add_argument(
        "--epoch",
        type=int,
        default=844,
        help="Epoch number to download (e.g., 844)",
    )
    parser.add_argument(
        "-c",
        "--skip-clickhouse",
        action="store_true",
        help="Download only the Neo4j dump",
    )
    parser.add_argument(
        "--token",
        type=str,
        default=None,
        help="Hugging Face token (or set HF_TOKEN env var)",
    )
    return parser.parse_args(argv)
|
|
|
|
def _download_snapshot(
    repo_id: str,
    repo_type: str,
    dest_dir: Path,
    patterns: List[str],
    token: Optional[str],
) -> None:
    """Download the files matching ``patterns`` from one Hub repo into ``dest_dir``."""
    # NOTE(review): recent huggingface_hub releases report `local_dir_use_symlinks`
    # and `resume_download` as deprecated — confirm against the pinned version
    # before dropping them; they are kept here to preserve current behavior.
    snapshot_download(
        repo_id=repo_id,
        repo_type=repo_type,
        local_dir=str(dest_dir),
        local_dir_use_symlinks=False,
        allow_patterns=patterns,
        resume_download=True,
        token=token,
    )


def main() -> None:
    """Download one epoch's artifacts, then the wallet-socials parquet files.

    The token comes from ``--token`` or, failing that, the ``HF_TOKEN``
    environment variable; it may be ``None``. Epoch files are written to
    ``<DEFAULT_DEST_DIR>/epoch_<N>`` and social files to
    ``<DEFAULT_DEST_DIR>/socials``.
    """
    args = parse_args()
    token = args.token or os.environ.get("HF_TOKEN")

    patterns = build_patterns(args.epoch, skip_clickhouse=args.skip_clickhouse)
    dest_root = Path(DEFAULT_DEST_DIR).expanduser()
    dest_dir = dest_root / f"epoch_{args.epoch}"
    dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading epoch {args.epoch} files from {REPO_ID} to {dest_dir}")
    print("Files:")
    for pattern in patterns:
        print(f" - {pattern}")

    _download_snapshot(REPO_ID, REPO_TYPE, dest_dir, patterns, token)

    # Social artifacts are not per-epoch: they live in a separate dataset repo
    # and are downloaded on every run, regardless of --skip-clickhouse.
    social_repo_id = "zirobtc/memes"
    social_files = [
        "wallet_socials_1763057853.parquet",
        "wallet_socials_2.parquet",
        "wallet_socials_3.parquet",
    ]

    social_dest_dir = dest_root / "socials"
    social_dest_dir.mkdir(parents=True, exist_ok=True)

    print(f"Downloading social artifacts from {social_repo_id} to {social_dest_dir}")
    _download_snapshot(social_repo_id, "dataset", social_dest_dir, social_files, token)

    print("Download complete.")
|
|
|
|
# Run the download only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
|