| | """ |
| | Export minimal dataset to binary format for fast client-side loading. |
| | This creates a compact binary representation optimized for WebGL rendering. |
| | """ |
| | import struct |
| | import json |
| | import numpy as np |
| | import pandas as pd |
| | from pathlib import Path |
| | import sys |
| | import os |
| |
|
| | |
# Make the project root importable (this file sits one directory below it),
# so the `utils.*` imports resolve when the script is run directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
| |
|
| | from utils.data_loader import ModelDataLoader |
| | from utils.dimensionality_reduction import DimensionReducer |
| | from utils.embeddings import ModelEmbedder |
| |
|
| |
|
def calculate_family_depths(df: pd.DataFrame) -> dict:
    """Calculate the depth of each model in its family tree.

    Depth is 0 for roots (parent missing, NaN, '' or the literal string
    'nan'), and parent_depth + 1 otherwise. A model referenced as a parent
    but absent from ``df`` is treated as a root (depth 0). Cycles are broken
    by treating the first revisited node as depth 0.

    Args:
        df: DataFrame with a ``model_id`` column and an optional
            ``parent_model`` column.

    Returns:
        dict mapping model_id (str) -> depth (int). Also contains entries
        for ids that only appear as parents.
    """
    # Build a first-occurrence parent map once: O(n) total, instead of an
    # O(n) DataFrame filter per lookup as before.
    parent_of: dict = {}
    if 'parent_model' in df.columns:
        for mid, parent in zip(df['model_id'].astype(str), df['parent_model']):
            parent_of.setdefault(mid, parent)

    def _is_root_value(parent) -> bool:
        # NaN/None, empty string and the literal 'nan' all mean "no parent".
        return pd.isna(parent) or parent == '' or str(parent) == 'nan'

    depths: dict = {}

    # Iterative walk instead of recursion: deep parent chains cannot hit the
    # interpreter recursion limit, and each node's depth is memoized.
    for model_id in df['model_id'].unique():
        current = str(model_id)
        chain = []   # nodes walked upward, nearest-first
        seen = set() # per-walk cycle guard
        base = 0
        while True:
            if current in depths:
                base = depths[current]
                break
            if current in seen:
                base = 0  # cycle: first revisited node counts as depth 0
                break
            seen.add(current)
            chain.append(current)
            if current not in parent_of:
                # Unknown model (only referenced as a parent): root, depth 0.
                depths[current] = 0
                chain.pop()
                base = 0
                break
            parent = parent_of[current]
            if _is_root_value(parent):
                depths[current] = 0
                chain.pop()
                base = 0
                break
            current = str(parent)
        # Assign depths downward: each step away from the ancestor adds 1.
        for node in reversed(chain):
            base += 1
            depths[node] = base

    return depths
| |
|
| |
|
def export_binary_dataset(df: pd.DataFrame, reduced_embeddings: np.ndarray, output_dir: Path):
    """
    Export minimal dataset to binary format for fast client-side loading.

    Binary layout (all fields explicit little-endian, fixed sizes so a
    client can parse with a DataView):
    - Header (64 bytes): magic b'HFVIZ', version (uint8), num_models,
      num_domains, num_licenses, num_families (uint32 each), zero padding
    - Domain lookup table (32 bytes per domain, NUL-padded UTF-8)
    - License lookup table (32 bytes per license, NUL-padded UTF-8)
    - Model records (17 bytes each): x, y, z (float32), domain_id (uint8),
      license_id (uint8), family_id (uint16), flags (uint8)

    Flag bits: 0x01 = root model, 0x02 = has a parent (and no children),
    0x04 = has children.

    Side effects: writes embeddings.bin, model_ids.json and metadata.json
    into ``output_dir``; may add 'x'/'y'/'z' columns to ``df`` in place.

    Raises:
        ValueError: if coordinates are missing and ``reduced_embeddings``
            is None or does not match ``df`` in length.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Exporting {len(df)} models to binary format...")

    # Derive 3D coordinates from the reduced embeddings when absent.
    if 'x' not in df.columns or 'y' not in df.columns:
        if reduced_embeddings is None or len(reduced_embeddings) != len(df):
            raise ValueError("Need reduced embeddings to generate coordinates")

        df['x'] = reduced_embeddings[:, 0] if reduced_embeddings.shape[1] > 0 else 0.0
        df['y'] = reduced_embeddings[:, 1] if reduced_embeddings.shape[1] > 1 else 0.0
        df['z'] = reduced_embeddings[:, 2] if reduced_embeddings.shape[1] > 2 else 0.0

    # Lookup tables, capped at 255 entries so ids fit in uint8 (255 = unknown).
    domains = sorted(df['library_name'].dropna().astype(str).unique())
    domains = [d for d in domains if d and d != 'nan'][:255]
    domain_index = {d: i for i, d in enumerate(domains)}  # O(1) per-row lookup

    licenses = sorted(df['license'].dropna().astype(str).unique())
    licenses = [lic for lic in licenses if lic and lic != 'nan'][:255]
    license_index = {lic: i for i, lic in enumerate(licenses)}

    def _no_parent(value) -> bool:
        # NaN/None, empty string and the literal 'nan' all mean "no parent".
        return pd.isna(value) or value == '' or str(value) == 'nan'

    # First-occurrence parent map, built once (the old code filtered the
    # whole DataFrame per lookup, O(n) each time).
    parent_of = {}
    if 'parent_model' in df.columns:
        for mid, parent in zip(df['model_id'].astype(str), df['parent_model']):
            parent_of.setdefault(mid, parent)

    # Models that appear as someone's parent, precomputed once instead of
    # an O(n) DataFrame filter per row (was O(n^2) overall).
    has_children = {str(p) for p in parent_of.values() if not _no_parent(p)}

    root_cache = {}

    def get_root_parent(model_id: str) -> str:
        """Walk parent links up to the family root; memoized and cycle-safe."""
        visited = set()
        current = str(model_id)
        # BUGFIX: original `while current in visited == False` chained to
        # `(current in visited) and (visited == False)` -> loop never ran.
        while current not in visited:
            if current in root_cache:
                current = root_cache[current]
                break
            visited.add(current)
            if current not in parent_of:
                break
            parent = parent_of[current]
            if _no_parent(parent):
                break
            current = str(parent)
        for node in visited:
            root_cache[node] = current
        return current

    # Dense family ids in first-seen order of each root.
    family_of_root = {}
    model_to_family = {}
    for model_id in df['model_id'].unique():
        mid = str(model_id)
        root = get_root_parent(mid)
        if root not in family_of_root:
            family_of_root[root] = len(family_of_root)
        model_to_family[mid] = family_of_root[root]

    if len(family_of_root) > 65535:
        # Too many families for uint16: fall back to hashing the root id.
        import hashlib
        for model_id in df['model_id'].unique():
            mid = str(model_id)
            root = get_root_parent(mid)
            model_to_family[mid] = int(hashlib.md5(root.encode()).hexdigest()[:4], 16) % 65535

    # BUGFIX: the original 'fffBBBH' format packed family_id as uint8 (it can
    # reach 65535 -> struct.error) and flags as uint16, and native alignment
    # made the record size platform-dependent. '<fffBBHB' is 17 bytes, fixed.
    record_struct = struct.Struct('<fffBBHB')
    records = []
    model_ids = []

    for _, row in df.iterrows():
        model_id = str(row['model_id'])
        model_ids.append(model_id)

        x = float(row.get('x', 0.0))
        y = float(row.get('y', 0.0))
        z = float(row.get('z', 0.0))

        domain_id = domain_index.get(str(row.get('library_name', '')), 255)
        license_id = license_index.get(str(row.get('license', '')), 255)
        family_id = model_to_family.get(model_id, 65535)

        flags = 0
        parent = row.get('parent_model')
        if _no_parent(parent):
            flags |= 0x01  # root
        if model_id in has_children:
            flags |= 0x04  # has children
        elif not _no_parent(parent):
            flags |= 0x02  # has a parent (and no children) -- original elif kept

        records.append(record_struct.pack(x, y, z, domain_id, license_id, family_id, flags))

    num_models = len(records)
    num_families = len(set(model_to_family.values()))

    with open(output_dir / 'embeddings.bin', 'wb') as f:
        # BUGFIX: the original format had 9 fields but was given 10 values
        # (struct.error at runtime). '<5sBIIII42s' is exactly 64 bytes.
        header = struct.pack('<5sBIIII42s',
                             b'HFVIZ',      # magic
                             1,             # format version
                             num_models,
                             len(domains),
                             len(licenses),
                             num_families,
                             b'\x00' * 42)  # reserved padding
        f.write(header)

        # Fixed 32-byte NUL-padded UTF-8 name slots (truncated at 31 bytes).
        for name in domains:
            f.write(name.encode('utf-8')[:31].ljust(32, b'\x00'))
        for name in licenses:
            f.write(name.encode('utf-8')[:31].ljust(32, b'\x00'))

        f.write(b''.join(records))

    # Model ids stay in JSON: variable-length strings, record index == list index.
    with open(output_dir / 'model_ids.json', 'w') as f:
        json.dump(model_ids, f)

    metadata = {
        'domains': domains,
        'licenses': licenses,
        'num_models': num_models,
        'num_families': num_families,
        'version': 1
    }
    with open(output_dir / 'metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    binary_size = (output_dir / 'embeddings.bin').stat().st_size
    json_size = (output_dir / 'model_ids.json').stat().st_size

    print(f"β Exported {num_models} models")
    print(f"β Binary size: {binary_size / 1024 / 1024:.2f} MB")
    print(f"β Model IDs JSON: {json_size / 1024 / 1024:.2f} MB")
    print(f"β Total: {(binary_size + json_size) / 1024 / 1024:.2f} MB")
    print(f"β Domains: {len(domains)}")
    print(f"β Licenses: {len(licenses)}")
    print(f"β Families: {num_families}")
| |
|
| |
|
if __name__ == '__main__':
    import argparse

    # Command-line options for the export script.
    cli = argparse.ArgumentParser(description='Export dataset to binary format')
    cli.add_argument('--output', type=str, default='backend/cache/binary', help='Output directory')
    cli.add_argument('--sample-size', type=int, default=None, help='Sample size (for testing)')
    args = cli.parse_args()

    target_dir = Path(args.output)

    print("Loading dataset...")
    loader = ModelDataLoader()
    df = loader.preprocess_for_embedding(loader.load_data(sample_size=args.sample_size))

    # Compute embeddings + 3D reduction only when the dataset does not
    # already carry x/y coordinates.
    if 'x' in df.columns and 'y' in df.columns:
        reduced = None
    else:
        print("Generating embeddings...")
        vectors = ModelEmbedder().generate_embeddings(df['combined_text'].tolist())

        print("Reducing dimensions...")
        reduced = DimensionReducer().reduce_dimensions(vectors, n_components=3, method='umap')

    export_binary_dataset(df, reduced, target_dir)
    print("Done!")
| |
|
| |
|