#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.
Usage:
python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
import argparse
import torch
from pathlib import Path
import random
from collections import Counter
def validate_cache_file(filepath):
    """Check one cache file for the fields offline (v2) training needs.

    Loads *filepath* with ``torch.load`` and verifies both the base schema
    and the v2 additions (``cached_wallet_data``, ``cached_graph_data``).

    Returns a dict with:
        valid  -- True when no required field is missing
        issues -- list of human-readable problem descriptions
        stats  -- basic per-file statistics (counts, shapes, image flag)
        is_v2  -- True when both v2 containers are present
    Load failures are reported as an invalid result, never raised.
    """
    try:
        # NOTE: weights_only=False deserializes arbitrary pickled objects,
        # so only trusted, locally-produced cache files should be loaded.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        issues = []

        # Fields every cache version (v1 and v2) must carry.
        for required in (
            "mint_timestamp", "token_address", "creator_address",
            "trades", "transfers", "quality_score", "ohlc_1s",
        ):
            if required not in data:
                issues.append(f"Missing base field: {required}")

        # v2 addition: pre-fetched wallet data for fully-offline runs.
        if 'cached_wallet_data' not in data:
            issues.append("Missing cached_wallet_data (v2)")
        else:
            for sub in ('profiles', 'socials', 'holdings'):
                if sub not in data['cached_wallet_data']:
                    issues.append(f"Missing cached_wallet_data.{sub}")

        # v2 addition: pre-fetched graph data.
        if 'cached_graph_data' not in data:
            issues.append("Missing cached_graph_data (v2)")
        else:
            for sub in ('entities', 'links'):
                if sub not in data['cached_graph_data']:
                    issues.append(f"Missing cached_graph_data.{sub}")

        # A cached token image is optional; record whether one exists.
        has_image = data.get('cached_image_bytes') is not None

        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }

        return {
            'valid': not issues,
            'issues': issues,
            'stats': stats,
            'is_v2': 'cached_wallet_data' in data and 'cached_graph_data' in data,
        }
    except Exception as e:
        # Corrupt or unreadable file: surface the error text as an issue.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }
def main():
    """Validate sampled (or all) cache files and print a summary report.

    Exits early with a message when the cache directory is missing or
    contains no ``sample_*.pt`` files.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")
    # BUG FIX: an empty cache directory previously crashed with
    # ZeroDivisionError in the percentage lines below; bail out early.
    if not cache_files:
        print("No cache files to validate.")
        return

    # Sample a subset unless --full was requested.
    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate each file, keeping the path for the invalid-file report.
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Summary counters (total is guaranteed > 0 by the early exit above).
    total = len(results)
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f" Total checked: {total}")
    print(f" Valid: {valid_count} ({100*valid_count/total:.1f}%)")
    print(f" V2 format (complete offline): {v2_count} ({100*v2_count/total:.1f}%)")
    print(f" Has cached image: {has_image_count} ({100*has_image_count/total:.1f}%)")

    # Frequency breakdown of every issue across all checked files.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])
    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f" {issue}: {count}")

    # Aggregate stats over the v2-format subset.
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f" Avg wallets cached: {avg_wallets:.1f}")
        print(f" Avg trades: {avg_trades:.1f}")

    # Show up to five invalid files with their first two issues each.
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f" {r['filepath'].name}: {r['issues'][:2]}")

    # Recommendation based on how much of the cache is v2.
    print("\n" + "=" * 60)
    if v2_count == total:
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {total - v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")


if __name__ == "__main__":
    main()