"""Download and store HuggingFace models in a device DuckDB database."""
from typing import Optional, Dict, Any
import os
import json
from huggingface_hub import snapshot_download
import duckdb
import numpy as np

# HuggingFace auth token from the environment; None means anonymous access.
HF_TOKEN = os.getenv("HF_TOKEN")


def download_model(model_id: str, cache_dir: Optional[str] = None) -> str:
    """Download a model snapshot from the HuggingFace Hub.

    Args:
        model_id: Hub repository id, e.g. "org/model".
        cache_dir: Directory to cache snapshots in. Defaults to
            ~/.cache/helium/models.

    Returns:
        Local filesystem path of the downloaded snapshot.
    """
    print(f"Downloading model {model_id} from HuggingFace Hub")

    # Use default cache dir if none provided.
    if cache_dir is None:
        cache_dir = os.path.join(
            os.path.expanduser("~"), ".cache", "helium", "models"
        )
    os.makedirs(cache_dir, exist_ok=True)

    # Pass the env token so gated/private repos work when HF_TOKEN is set
    # (the original read HF_TOKEN but never used it). `resume_download` is
    # deprecated upstream — resuming is now the default behavior.
    local_path = snapshot_download(
        repo_id=model_id,
        cache_dir=cache_dir,
        token=HF_TOKEN,
        local_files_only=False,  # always try to download first
    )

    print(f"Model downloaded to {local_path}")
    return local_path


def _ensure_schema(conn) -> None:
    """Create the config/weights tables if they do not already exist."""
    conn.execute("""
        CREATE TABLE IF NOT EXISTS model_configs (
            model_id VARCHAR PRIMARY KEY,
            config JSON
        );
        CREATE TABLE IF NOT EXISTS model_weights (
            model_id VARCHAR,
            layer_name VARCHAR,
            weight_data BLOB,
            weight_dtype VARCHAR,
            weight_shape VARCHAR,
            PRIMARY KEY (model_id, layer_name)
        );
    """)


def _store_config(conn, model_path: str, model_id: str) -> None:
    """Read the snapshot's config.json and upsert it into model_configs."""
    config_path = os.path.join(model_path, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)
    conn.execute(
        "INSERT OR REPLACE INTO model_configs (model_id, config) VALUES (?, ?)",
        [model_id, json.dumps(config)],
    )


def _store_weights(conn, model_path: str, model_id: str) -> None:
    """Walk the snapshot tree and upsert every .bin file into model_weights."""
    for root, _, files in os.walk(model_path):
        for file in files:
            if not file.endswith(".bin"):
                continue
            layer_name = os.path.splitext(file)[0]
            weight_path = os.path.join(root, file)
            # NOTE(review): reads the file as a raw float32 buffer — assumes
            # .bin files are unstructured float32 dumps. Confirm this holds
            # for pytorch_model.bin-style checkpoints, which are pickled.
            weight_data = np.fromfile(weight_path, dtype=np.float32)
            conn.execute(
                """
                INSERT OR REPLACE INTO model_weights
                    (model_id, layer_name, weight_data, weight_dtype, weight_shape)
                VALUES (?, ?, ?, ?, ?)
                """,
                [
                    model_id,
                    layer_name,
                    weight_data.tobytes(),
                    str(weight_data.dtype),
                    str(weight_data.shape),
                ],
            )


def store_in_device_db(model_path: str, device_db_url: str, model_id: str) -> None:
    """Store a downloaded model's config and weights in the device DB.

    Creates the schema if needed, upserts config.json into `model_configs`,
    and upserts each .bin weight file (raw float32 bytes plus dtype/shape
    metadata) into `model_weights`, keyed by (model_id, layer_name).

    Args:
        model_path: Local snapshot directory produced by download_model().
        device_db_url: DuckDB database path/URL to connect to.
        model_id: Identifier to key the stored rows by.
    """
    print(f"Storing model {model_id} in device DB at {device_db_url}")

    conn = duckdb.connect(device_db_url)
    # try/finally so the connection is released even when storage fails —
    # the original leaked the connection on any exception. The original also
    # contained a duplicated second storage pass (a plain INSERT that violated
    # the model_configs primary key, and np.load() on raw .bin files, which
    # raises for non-.npy data); that dead/crashing code is removed.
    try:
        _ensure_schema(conn)
        _store_config(conn, model_path, model_id)
        _store_weights(conn, model_path, model_id)
        conn.commit()
    finally:
        conn.close()