# backend/scripts/export_binary.py
"""
Export minimal dataset to binary format for fast client-side loading.
This creates a compact binary representation optimized for WebGL rendering.
"""
import struct
import json
import numpy as np
import pandas as pd
from pathlib import Path
import sys
import os
# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.data_loader import ModelDataLoader
from utils.dimensionality_reduction import DimensionReducer
from utils.embeddings import ModelEmbedder

def calculate_family_depths(df: pd.DataFrame) -> dict:
    """Calculate the depth of each model in its family tree (0 = base model)."""
    depths = {}

    def get_depth(model_id: str, visited: set = None) -> int:
        if visited is None:
            visited = set()
        if model_id in visited:
            return 0  # Cycle detected
        visited.add(model_id)
        if model_id in depths:
            return depths[model_id]
        model_row = df[df['model_id'] == model_id]
        if model_row.empty:
            depths[model_id] = 0
            return 0
        parent = model_row.iloc[0].get('parent_model')
        if pd.isna(parent) or parent == '' or str(parent) == 'nan':
            depths[model_id] = 0
            return 0
        depth = get_depth(str(parent), visited.copy()) + 1
        depths[model_id] = depth
        return depth

    for model_id in df['model_id'].unique():
        if model_id not in depths:
            get_depth(str(model_id))
    return depths
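
# Illustrative example of the depth calculation (a sketch with hypothetical
# model ids, not executed by the export pipeline):
#
#     demo = pd.DataFrame({
#         'model_id': ['base-model', 'child-model', 'grandchild-model'],
#         'parent_model': [None, 'base-model', 'child-model'],
#     })
#     calculate_family_depths(demo)
#     # -> {'base-model': 0, 'child-model': 1, 'grandchild-model': 2}
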
def export_binary_dataset(df: pd.DataFrame, reduced_embeddings: np.ndarray, output_dir: Path):
    """
    Export minimal dataset to binary format for fast client-side loading.

    Binary format (all fields little-endian):
    - Header (64 bytes): magic, version, counts, reserved bytes, zero padding
    - Domain lookup table (32 bytes per domain, null-padded UTF-8)
    - License lookup table (32 bytes per license, null-padded UTF-8)
    - Model records (17 bytes each): f32 x, f32 y, f32 z,
      u8 domain_id, u8 license_id, u16 family_id, u8 flags
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Exporting {len(df)} models to binary format...")

    # Ensure we have coordinates
    if 'x' not in df.columns or 'y' not in df.columns:
        if reduced_embeddings is None or len(reduced_embeddings) != len(df):
            raise ValueError("Need reduced embeddings to generate coordinates")
        df['x'] = reduced_embeddings[:, 0] if reduced_embeddings.shape[1] > 0 else 0.0
        df['y'] = reduced_embeddings[:, 1] if reduced_embeddings.shape[1] > 1 else 0.0
        df['z'] = reduced_embeddings[:, 2] if reduced_embeddings.shape[1] > 2 else 0.0

    # Create lookup tables (ids 0-254; 255 is the "unknown" sentinel)
    # Domain = library_name
    domains = sorted(df['library_name'].dropna().astype(str).unique())
    domains = [d for d in domains if d and d != 'nan'][:255]
    domain_index = {d: i for i, d in enumerate(domains)}

    # License
    licenses = sorted(df['license'].dropna().astype(str).unique())
    licenses = [lic for lic in licenses if lic and lic != 'nan'][:255]
    license_index = {lic: i for i, lic in enumerate(licenses)}

    # Family ID mapping: group models by their root ancestor (via parent_model)
    def get_root_parent(model_id: str) -> str:
        visited = set()
        current = str(model_id)
        while current not in visited:
            visited.add(current)
            model_row = df[df['model_id'] == current]
            if model_row.empty:
                return current
            parent = model_row.iloc[0].get('parent_model')
            if pd.isna(parent) or parent == '' or str(parent) == 'nan':
                return current
            current = str(parent)
        return current  # Cycle detected: treat the revisited model as the root
    root_parents = {}
    family_counter = 0
    for model_id in df['model_id'].unique():
        root = get_root_parent(str(model_id))
        if root not in root_parents:
            root_parents[root] = family_counter
            family_counter += 1

    # Map each model to its family
    model_to_family = {}
    for model_id in df['model_id'].unique():
        root = get_root_parent(str(model_id))
        model_to_family[str(model_id)] = root_parents.get(root, 65535)

    # family_id is a u16 and 65535 is reserved as the "no family" sentinel;
    # if there are too many families, fall back to hash-based family IDs
    if len(root_parents) > 65535:
        import hashlib
        for model_id in df['model_id'].unique():
            root = get_root_parent(str(model_id))
            family_hash = int(hashlib.md5(root.encode()).hexdigest()[:4], 16) % 65535
            model_to_family[str(model_id)] = family_hash
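
    # For the hypothetical chain from the calculate_family_depths example
    # (base-model -> child-model -> grandchild-model), all three models
    # resolve to the same root and therefore share one family id:
    #   root_parents    == {'base-model': 0}
    #   model_to_family == {'base-model': 0, 'child-model': 0, 'grandchild-model': 0}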

    # Prepare model records
    records = []
    model_ids = []
    # Precompute which models appear as a parent (avoids an O(n^2) scan)
    parents_with_children = set(df['parent_model'].dropna().astype(str))
    for _, row in df.iterrows():
        model_id = str(row['model_id'])
        model_ids.append(model_id)

        # Get coordinates
        x = float(row.get('x', 0.0))
        y = float(row.get('y', 0.0))
        z = float(row.get('z', 0.0))

        # Encode domain (library_name) and license; 255 = unknown
        domain_id = domain_index.get(str(row.get('library_name', '')), 255)
        license_id = license_index.get(str(row.get('license', '')), 255)

        # Encode family; 65535 = no family
        family_id = model_to_family.get(model_id, 65535)

        # Encode flags (bit 0: is_base_model, bit 1: has_parent, bit 2: has_children)
        flags = 0
        parent = row.get('parent_model')
        if pd.isna(parent) or parent == '' or str(parent) == 'nan':
            flags |= 0x01  # is_base_model
        else:
            flags |= 0x02  # has_parent
        if model_id in parents_with_children:
            flags |= 0x04  # has_children

        # Pack record (17 bytes, little-endian):
        # f32 x, f32 y, f32 z, u8 domain, u8 license, u16 family, u8 flags
        records.append(struct.pack('<fffBBHB', x, y, z, domain_id, license_id, family_id, flags))
    num_models = len(records)

    # Write binary file
    with open(output_dir / 'embeddings.bin', 'wb') as f:
        # Header (64 bytes, little-endian; '38x' zero-pads the 26 data bytes to 64)
        header = struct.pack(
            '<5sBIIIIBBH38x',
            b'HFVIZ',                            # magic (5 bytes)
            1,                                   # version (1 byte)
            num_models,                          # num_models (4 bytes)
            len(domains),                        # num_domains (4 bytes)
            len(licenses),                       # num_licenses (4 bytes)
            len(set(model_to_family.values())),  # num_families (4 bytes)
            0,                                   # reserved (1 byte)
            0,                                   # reserved (1 byte)
            0,                                   # reserved (2 bytes)
        )
        f.write(header)

        # Domain lookup table (32 bytes per domain, null-padded)
        for domain in domains:
            domain_bytes = domain.encode('utf-8')[:31]
            f.write(domain_bytes.ljust(32, b'\x00'))

        # License lookup table (32 bytes per license, null-padded)
        for lic in licenses:
            license_bytes = lic.encode('utf-8')[:31]
            f.write(license_bytes.ljust(32, b'\x00'))

        # Model records
        f.write(b''.join(records))

    # Write model IDs JSON (separate file used as a string table)
    with open(output_dir / 'model_ids.json', 'w') as f:
        json.dump(model_ids, f)

    # Write metadata JSON
    metadata = {
        'domains': domains,
        'licenses': licenses,
        'num_models': num_models,
        'num_families': len(set(model_to_family.values())),
        'version': 1
    }
    with open(output_dir / 'metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    binary_size = (output_dir / 'embeddings.bin').stat().st_size
    json_size = (output_dir / 'model_ids.json').stat().st_size
    print(f"✓ Exported {num_models} models")
    print(f"✓ Binary size: {binary_size / 1024 / 1024:.2f} MB")
    print(f"✓ Model IDs JSON: {json_size / 1024 / 1024:.2f} MB")
    print(f"✓ Total: {(binary_size + json_size) / 1024 / 1024:.2f} MB")
    print(f"✓ Domains: {len(domains)}")
    print(f"✓ Licenses: {len(licenses)}")
    print(f"✓ Families: {len(set(model_to_family.values()))}")
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Export dataset to binary format')
    parser.add_argument('--output', type=str, default='backend/cache/binary', help='Output directory')
    parser.add_argument('--sample-size', type=int, default=None, help='Sample size (for testing)')
    args = parser.parse_args()
    output_dir = Path(args.output)

    # Load data
    print("Loading dataset...")
    data_loader = ModelDataLoader()
    df = data_loader.load_data(sample_size=args.sample_size)
    df = data_loader.preprocess_for_embedding(df)

    # Generate embeddings and reduce dimensions if needed
    if 'x' not in df.columns or 'y' not in df.columns:
        print("Generating embeddings...")
        embedder = ModelEmbedder()
        embeddings = embedder.generate_embeddings(df['combined_text'].tolist())
        print("Reducing dimensions...")
        reducer = DimensionReducer()
        reduced_embeddings = reducer.reduce_dimensions(embeddings, n_components=3, method='umap')
    else:
        reduced_embeddings = None

    # Export
    export_binary_dataset(df, reduced_embeddings, output_dir)
    print("Done!")