#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.

Usage:
    python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
import argparse
import random
from collections import Counter
from pathlib import Path

import torch

# Fields every cache file (v1 and v2) must contain.
BASE_FIELDS = (
    "mint_timestamp", "token_address", "creator_address",
    "trades", "transfers", "quality_score", "ohlc_1s",
)

# Sub-keys required inside each v2 offline-mode section.
WALLET_KEYS = ("profiles", "socials", "holdings")
GRAPH_KEYS = ("entities", "links")


def validate_cache_file(filepath):
    """Validate a single cache file has all required fields.

    Args:
        filepath: Path to a ``.pt`` cache file produced by the caching pipeline.

    Returns:
        dict with keys:
            valid  (bool): True when no issues were found.
            issues (list[str]): human-readable problem descriptions.
            stats  (dict): per-file counts (trades, cached wallets,
                graph links, image presence, OHLC tensor shape).
            is_v2  (bool): True when both v2 offline sections are present.

    Never raises: any exception (load failure, malformed payload) is
    reported as a ``Load error`` issue in the returned dict.
    """
    try:
        # NOTE(review): weights_only=False unpickles arbitrary Python
        # objects — only run this tool on trusted local cache files.
        data = torch.load(filepath, map_location='cpu', weights_only=False)
        issues = []

        # Required base fields (present in both v1 and v2 formats).
        for field in BASE_FIELDS:
            if field not in data:
                issues.append(f"Missing base field: {field}")

        # New v2 fields for offline mode.
        if 'cached_wallet_data' not in data:
            issues.append("Missing cached_wallet_data (v2)")
        else:
            wallet_data = data['cached_wallet_data']
            for key in WALLET_KEYS:
                if key not in wallet_data:
                    issues.append(f"Missing cached_wallet_data.{key}")

        if 'cached_graph_data' not in data:
            issues.append("Missing cached_graph_data (v2)")
        else:
            graph_data = data['cached_graph_data']
            for key in GRAPH_KEYS:
                if key not in graph_data:
                    issues.append(f"Missing cached_graph_data.{key}")

        # Image is optional but good to have.
        has_image = data.get('cached_image_bytes') is not None

        # Collect stats (defensive .get() chains so missing sections
        # simply count as zero rather than raising).
        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }

        return {
            'valid': len(issues) == 0,
            'issues': issues,
            'stats': stats,
            'is_v2': 'cached_wallet_data' in data and 'cached_graph_data' in data,
        }
    except Exception as e:
        # Broad on purpose: a corrupt or non-dict payload must be reported
        # as an invalid file, not crash the whole validation run.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }


def main():
    """CLI entry point: sample (or fully scan) a cache directory and report validity."""
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")

    # Bug fix: the original divided by len(results) in the summary even
    # when no files matched, raising ZeroDivisionError. Bail out early.
    if not cache_files:
        print("ERROR: No cache files (sample_*.pt) found to validate.")
        return

    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate files
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Summary
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f" Total checked: {len(results)}")
    print(f" Valid: {valid_count} ({100*valid_count/len(results):.1f}%)")
    print(f" V2 format (complete offline): {v2_count} ({100*v2_count/len(results):.1f}%)")
    print(f" Has cached image: {has_image_count} ({100*has_image_count/len(results):.1f}%)")

    # Issue breakdown (most common first)
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])
    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f" {issue}: {count}")

    # Stats for valid v2 files
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f" Avg wallets cached: {avg_wallets:.1f}")
        print(f" Avg trades: {avg_trades:.1f}")

    # Show sample invalid files (capped at 5, 2 issues each, to keep output short)
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f" {r['filepath'].name}: {r['issues'][:2]}")

    # Recommendation
    print("\n" + "=" * 60)
    if v2_count == len(results):
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {len(results)-v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")


if __name__ == "__main__":
    main()