"""
Pre-compute the full derivative network graph and save it to disk.
This allows the API to load the network instantly instead of building it on-demand.

Usage:
    python scripts/precompute_network.py [--output-dir precomputed_data] [--version v1]
"""
| |
|
| | import os |
| | import sys |
| | import pickle |
| | import argparse |
| | import logging |
| | import time |
| | from pathlib import Path |
| | from typing import Optional |
| |
|

# Make the project root importable (one level above scripts/) so the
# `utils.*` modules below resolve when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
| |
|
| | import pandas as pd |
| | from utils.network_analysis import ModelNetworkBuilder |
| | from utils.precomputed_loader import PrecomputedDataLoader |
| | from utils.data_loader import ModelDataLoader |
| |
|
# Script-wide logging: timestamped INFO-level messages for progress reporting.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
| |
|
| |
|
def precompute_network(
    output_dir: str = "precomputed_data",
    version: str = "v1",
    include_edge_attributes: bool = False,
    min_downloads: int = 0,
    max_nodes: Optional[int] = None,
    load_from_hf: bool = False,
    sample_size: Optional[int] = None
) -> bool:
    """
    Pre-compute the full derivative network graph for the force-directed visualization.

    Args:
        output_dir: Directory to save the network file
        version: Version tag for the data
        include_edge_attributes: Whether to calculate edge attributes
        min_downloads: Minimum downloads to include a model
        max_nodes: Maximum number of nodes (top N by downloads)
        load_from_hf: If True, load directly from HF dataset (includes parent relationships)
        sample_size: If load_from_hf=True, sample this many models (None = all models)

    Returns:
        True on success, False on any load/build/save failure.
    """
    # Local imports kept inside the function (file convention); hoisted to
    # the top of the body instead of mid-flow so they fail fast.
    import json
    from datetime import datetime

    start_time = time.time()

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    logger.info("=" * 60)
    logger.info("PRE-COMPUTING FULL DERIVATIVE NETWORK")
    logger.info("=" * 60)

    # --- Step 1: load the model DataFrame --------------------------------
    logger.info("Step 1/3: Loading model data...")

    if load_from_hf:
        logger.info(f"Loading directly from Hugging Face dataset (sample_size={sample_size if sample_size else 'ALL'})...")
        data_loader = ModelDataLoader()
        df = data_loader.load_data(sample_size=sample_size, prioritize_base_models=False)
        df = data_loader.preprocess_for_embedding(df)

        # Index by model_id (kept as a column too, via drop=False) so
        # downstream lookups by id and by column both work.
        if 'model_id' in df.columns:
            df.set_index('model_id', drop=False, inplace=True)

        # Coerce count columns to ints; non-numeric / missing values become 0.
        for col in ['downloads', 'likes']:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

        logger.info(f"Loaded {len(df):,} models from HF dataset")
        logger.info(f"Columns: {list(df.columns)}")

        # Edges come from parent relationships; warn early if they are absent.
        if 'parent_model' not in df.columns:
            logger.warning("'parent_model' column not found - network will have 0 edges!")
        else:
            parent_count = df['parent_model'].notna().sum()
            logger.info(f"Models with parent relationships: {parent_count:,}")
    else:
        loader = PrecomputedDataLoader(data_dir=output_dir, version=version)

        if not loader.check_available():
            logger.error(f"Pre-computed data not found in {output_dir}")
            logger.info("Please run precompute_data.py first, download from HF Hub, or use --load-from-hf flag")
            return False

        try:
            # Embeddings/metadata are loaded alongside but not needed here.
            df, _embeddings, _metadata = loader.load_all()
            logger.info(f"Loaded {len(df):,} models from pre-computed data")

            if 'parent_model' not in df.columns:
                logger.warning("'parent_model' column not found in pre-computed data - network will have 0 edges!")
        except Exception as e:
            logger.error(f"Failed to load data: {e}")
            return False

    # --- Optional filtering ----------------------------------------------
    # BUGFIX: the previous `df[df.get('downloads', 0) >= min_downloads]`
    # raised KeyError when the 'downloads' column was missing, because
    # DataFrame.get then returns the scalar 0 and the comparison produces a
    # plain bool, not a boolean mask. Guard on the column explicitly.
    if min_downloads > 0:
        if 'downloads' in df.columns:
            df = df[df['downloads'] >= min_downloads]
            logger.info(f"Filtered to {len(df):,} models with >= {min_downloads} downloads")
        else:
            logger.warning("'downloads' column missing - skipping min-downloads filter")

    if max_nodes and len(df) > max_nodes:
        if 'downloads' in df.columns:
            df = df.nlargest(max_nodes, 'downloads', keep='first')
        else:
            # No ranking column available: fall back to the first N rows.
            df = df.head(max_nodes)
        logger.info(f"Limited to top {max_nodes:,} models by downloads")

    # --- Step 2: build the graph -----------------------------------------
    logger.info("Step 2/3: Building network graph (this may take 10-30 minutes)...")
    try:
        network_builder = ModelNetworkBuilder(df)
        graph = network_builder.build_full_derivative_network(
            include_edge_attributes=include_edge_attributes,
            filter_edge_types=None
        )

        logger.info(f"Graph built: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")
    except Exception as e:
        logger.error(f"Failed to build network graph: {e}", exc_info=True)
        return False

    # --- Step 3: persist the graph ---------------------------------------
    logger.info("Step 3/3: Saving network graph to disk...")
    network_file = output_path / "full_derivative_network.pkl"

    try:
        with open(network_file, 'wb') as f:
            pickle.dump(graph, f, protocol=pickle.HIGHEST_PROTOCOL)

        file_size_mb = network_file.stat().st_size / (1024 * 1024)
        logger.info(f"Saved network graph to {network_file}")
        logger.info(f"File size: {file_size_mb:.2f} MB")
    except Exception as e:
        logger.error(f"Failed to save network graph: {e}", exc_info=True)
        return False

    # Sidecar JSON metadata so consumers can sanity-check the pickle
    # (node/edge counts, build options) without unpickling it.
    metadata_file = output_path / "network_metadata.json"
    network_metadata = {
        "created_at": datetime.now().isoformat(),
        "version": version,
        "nodes": graph.number_of_nodes(),
        "edges": graph.number_of_edges(),
        "include_edge_attributes": include_edge_attributes,
        "min_downloads": min_downloads,
        "max_nodes": max_nodes,
        "file_size_mb": round(file_size_mb, 2)
    }

    with open(metadata_file, 'w') as f:
        json.dump(network_metadata, f, indent=2)

    total_time = time.time() - start_time
    logger.info("=" * 60)
    logger.info(f"PRE-COMPUTATION COMPLETE in {total_time:.2f} seconds")
    logger.info(f"Network graph: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")
    logger.info(f"Saved to: {network_file}")
    logger.info("=" * 60)

    return True
| |
|
| |
|
if __name__ == "__main__":
    # NOTE: the redundant `import time` previously here was removed — `time`
    # is already imported at module level and is not used in this guard.
    parser = argparse.ArgumentParser(description="Pre-compute full derivative network graph")
    parser.add_argument("--output-dir", type=str, default="precomputed_data",
                        help="Output directory for pre-computed files")
    parser.add_argument("--version", type=str, default="v1",
                        help="Version tag for the data")
    parser.add_argument("--include-edge-attributes", action="store_true",
                        help="Include edge attributes (slower but more detailed)")
    parser.add_argument("--min-downloads", type=int, default=0,
                        help="Minimum downloads to include a model")
    parser.add_argument("--max-nodes", type=int, default=None,
                        help="Maximum number of nodes (top N by downloads)")
    parser.add_argument("--load-from-hf", action="store_true",
                        help="Load directly from HF dataset instead of pre-computed files (includes parent relationships)")
    parser.add_argument("--sample-size", type=int, default=None,
                        help="If --load-from-hf, sample this many models (default: all models, use 0 for all)")

    args = parser.parse_args()

    success = precompute_network(
        output_dir=args.output_dir,
        version=args.version,
        include_edge_attributes=args.include_edge_attributes,
        min_downloads=args.min_downloads,
        max_nodes=args.max_nodes,
        load_from_hf=args.load_from_hf,
        # --sample-size 0 is documented as "all models", which the loader
        # expresses as None.
        sample_size=None if args.sample_size == 0 else args.sample_size
    )

    # Conventional shell exit codes: 0 on success, 1 on failure.
    sys.exit(0 if success else 1)
| |
|
| |
|