#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.
Usage:
python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""
import argparse
import torch
from pathlib import Path
import random
from collections import Counter
def validate_cache_file(filepath):
    """Check one cache file for the fields offline (v2) training needs.

    Loads *filepath* with ``torch.load`` and verifies both the base schema
    and the v2 additions (``cached_wallet_data``, ``cached_graph_data``).

    Returns a dict with:
        valid  -- True when no required field is missing
        issues -- list of human-readable problem descriptions
        stats  -- basic per-file statistics (counts, shapes, image flag)
        is_v2  -- True when both v2 containers are present
    Load failures are reported as an invalid result, never raised.
    """
    try:
        # NOTE: weights_only=False deserializes arbitrary pickled objects,
        # so only trusted, locally-produced cache files should be loaded.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        issues = []

        # Fields every cache version (v1 and v2) must carry.
        for required in (
            "mint_timestamp", "token_address", "creator_address",
            "trades", "transfers", "quality_score", "ohlc_1s",
        ):
            if required not in data:
                issues.append(f"Missing base field: {required}")

        # v2 addition: pre-fetched wallet data for fully-offline runs.
        if 'cached_wallet_data' not in data:
            issues.append("Missing cached_wallet_data (v2)")
        else:
            for sub in ('profiles', 'socials', 'holdings'):
                if sub not in data['cached_wallet_data']:
                    issues.append(f"Missing cached_wallet_data.{sub}")

        # v2 addition: pre-fetched graph data.
        if 'cached_graph_data' not in data:
            issues.append("Missing cached_graph_data (v2)")
        else:
            for sub in ('entities', 'links'):
                if sub not in data['cached_graph_data']:
                    issues.append(f"Missing cached_graph_data.{sub}")

        # A cached token image is optional; record whether one exists.
        has_image = data.get('cached_image_bytes') is not None

        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }

        return {
            'valid': not issues,
            'issues': issues,
            'stats': stats,
            'is_v2': 'cached_wallet_data' in data and 'cached_graph_data' in data,
        }
    except Exception as e:
        # Corrupt or unreadable file: surface the error text as an issue.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False,
        }
def main():
    """Validate sampled (or all) cache files and print a summary report.

    Exits early with a message when the cache directory is missing or
    contains no ``sample_*.pt`` files.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")
    # BUG FIX: an empty cache directory previously crashed with
    # ZeroDivisionError in the percentage lines below; bail out early.
    if not cache_files:
        print("No cache files to validate.")
        return

    # Sample a subset unless --full was requested.
    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate each file, keeping the path for the invalid-file report.
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Summary counters (total is guaranteed > 0 by the early exit above).
    total = len(results)
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f" Total checked: {total}")
    print(f" Valid: {valid_count} ({100*valid_count/total:.1f}%)")
    print(f" V2 format (complete offline): {v2_count} ({100*v2_count/total:.1f}%)")
    print(f" Has cached image: {has_image_count} ({100*has_image_count/total:.1f}%)")

    # Frequency breakdown of every issue across all checked files.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])
    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f" {issue}: {count}")

    # Aggregate stats over the v2-format subset.
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f" Avg wallets cached: {avg_wallets:.1f}")
        print(f" Avg trades: {avg_trades:.1f}")

    # Show up to five invalid files with their first two issues each.
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f" {r['filepath'].name}: {r['issues'][:2]}")

    # Recommendation based on how much of the cache is v2.
    print("\n" + "=" * 60)
    if v2_count == total:
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {total - v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")


if __name__ == "__main__":
    main()