File size: 5,862 Bytes
e605733
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env python3
"""
Cache Validation Script: Verify cache files have complete offline data.

Usage:
    python scripts/validate_cache_v2.py --cache_dir /workspace/apollo/data/cache --sample_size 100
"""

import argparse
import torch
from pathlib import Path
import random
from collections import Counter


def validate_cache_file(filepath):
    """Validate that one cache file has every field needed for offline training.

    Args:
        filepath: Path (or str) to a ``sample_*.pt`` cache file.

    Returns:
        dict with keys:
            valid (bool): True when no required field is missing.
            issues (list[str]): description of each missing field or load error.
            stats (dict): per-file statistics; empty when the file failed to load.
            is_v2 (bool): True when both v2 offline-data containers are present.

    Never raises: any failure (unreadable file, unexpected structure) is
    reported through the returned ``issues`` list instead, so a single corrupt
    file cannot abort a whole validation run.
    """
    try:
        # NOTE(security): weights_only=False unpickles arbitrary Python
        # objects -- only point this script at cache files produced by this
        # project, never at files from untrusted sources.
        data = torch.load(filepath, map_location='cpu', weights_only=False)

        issues = []

        # Base fields required by both the old (v1) and new (v2) formats.
        base_fields = [
            "mint_timestamp", "token_address", "creator_address",
            "trades", "transfers", "quality_score", "ohlc_1s"
        ]
        issues.extend(
            f"Missing base field: {field}"
            for field in base_fields if field not in data
        )

        # v2 containers (needed for fully-offline training) and the sub-keys
        # each one must carry. Table-driven so the check is written once.
        v2_required = {
            'cached_wallet_data': ('profiles', 'socials', 'holdings'),
            'cached_graph_data': ('entities', 'links'),
        }
        for container, subkeys in v2_required.items():
            if container not in data:
                issues.append(f"Missing {container} (v2)")
            else:
                issues.extend(
                    f"Missing {container}.{key}"
                    for key in subkeys if key not in data[container]
                )

        # Image is optional but good to have.
        has_image = data.get('cached_image_bytes') is not None

        # Per-file statistics consumed by main() for the aggregate report.
        stats = {
            'num_trades': len(data.get('trades', [])),
            'num_wallets_cached': len(data.get('cached_wallet_data', {}).get('profiles', {})),
            'num_graph_links': len(data.get('cached_graph_data', {}).get('links', {})),
            'has_image': has_image,
            'ohlc_shape': tuple(data['ohlc_1s'].shape) if 'ohlc_1s' in data else None,
        }

        return {
            'valid': len(issues) == 0,
            'issues': issues,
            'stats': stats,
            'is_v2': 'cached_wallet_data' in data and 'cached_graph_data' in data
        }

    except Exception as e:
        # Broad catch is deliberate: treat unreadable/corrupt files as invalid
        # results rather than crashing the scan.
        return {
            'valid': False,
            'issues': [f"Load error: {str(e)}"],
            'stats': {},
            'is_v2': False
        }


def main():
    """CLI entry point: scan a cache directory and report v2-readiness.

    Parses ``--cache_dir`` / ``--sample_size`` / ``--full`` from sys.argv,
    validates a (possibly sampled) set of ``sample_*.pt`` files, and prints a
    summary plus a migration recommendation. Returns None; all reporting goes
    to stdout.
    """
    parser = argparse.ArgumentParser(description="Validate cache files for offline training")
    parser.add_argument("--cache_dir", type=str, required=True, help="Path to cache directory")
    parser.add_argument("--sample_size", type=int, default=100, help="Number of files to sample")
    parser.add_argument("--full", action="store_true", help="Check all files (slow)")
    args = parser.parse_args()

    cache_dir = Path(args.cache_dir)
    if not cache_dir.exists():
        print(f"ERROR: Cache directory not found: {cache_dir}")
        return

    cache_files = list(cache_dir.glob("sample_*.pt"))
    print(f"Found {len(cache_files)} cache files")

    # BUGFIX: an empty directory previously reached the percentage math below
    # with len(results) == 0 and raised ZeroDivisionError — bail out early.
    if not cache_files:
        print("ERROR: No sample_*.pt files found; nothing to validate.")
        return

    # Sample a subset unless --full was requested (keeps large caches fast).
    if not args.full and len(cache_files) > args.sample_size:
        cache_files = random.sample(cache_files, args.sample_size)
        print(f"Sampling {len(cache_files)} files for validation")

    # Validate every selected file, tagging each result with its source path
    # so invalid files can be named in the report.
    results = []
    for f in cache_files:
        result = validate_cache_file(f)
        result['filepath'] = f
        results.append(result)

    # Aggregate counts for the summary section.
    valid_count = sum(1 for r in results if r['valid'])
    v2_count = sum(1 for r in results if r['is_v2'])
    has_image_count = sum(1 for r in results if r.get('stats', {}).get('has_image', False))

    print("\n" + "="*60)
    print("VALIDATION SUMMARY")
    print("="*60)
    print(f"  Total checked: {len(results)}")
    print(f"  Valid: {valid_count} ({100*valid_count/len(results):.1f}%)")
    print(f"  V2 format (complete offline): {v2_count} ({100*v2_count/len(results):.1f}%)")
    print(f"  Has cached image: {has_image_count} ({100*has_image_count/len(results):.1f}%)")

    # Tally every distinct issue string across all files.
    all_issues = []
    for r in results:
        all_issues.extend(r['issues'])

    if all_issues:
        print("\nIssue breakdown:")
        for issue, count in Counter(all_issues).most_common():
            print(f"  {issue}: {count}")

    # Averages over v2-format files only (old-format files lack these stats).
    v2_results = [r for r in results if r['is_v2']]
    if v2_results:
        avg_wallets = sum(r['stats']['num_wallets_cached'] for r in v2_results) / len(v2_results)
        avg_trades = sum(r['stats']['num_trades'] for r in v2_results) / len(v2_results)
        print(f"\nV2 file stats:")
        print(f"  Avg wallets cached: {avg_wallets:.1f}")
        print(f"  Avg trades: {avg_trades:.1f}")

    # Show a few invalid files (capped at 5, two issues each) for triage.
    invalid_results = [r for r in results if not r['valid']]
    if invalid_results:
        print("\nSample invalid files:")
        for r in invalid_results[:5]:
            print(f"  {r['filepath'].name}: {r['issues'][:2]}")

    # Final recommendation: fully migrated, fully old, or mixed cache.
    print("\n" + "="*60)
    if v2_count == len(results):
        print("All files are V2 format. Ready for offline training!")
    elif v2_count == 0:
        print("No V2 files found. Run: python scripts/migrate_cache_v2.py --cache_dir " + str(cache_dir))
    else:
        print(f"Mixed cache: {v2_count} V2, {len(results)-v2_count} old format.")
        print("Old format files will use empty wallet/graph data during training.")
        print("Run migrate_cache_v2.py to upgrade remaining files.")


# Run the validator only when executed as a script (not on import).
if __name__ == "__main__":
    main()