# File size: 13,293 Bytes
# 2ba2072
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""
Pre-compute force-directed layout positions for the full model network.

Layouts are computed with networkx ``spring_layout`` (the default) or, with
``--algorithm fa2``, with ForceAtlas2 and its Barnes-Hut optimization when the
``fa2`` package is installed.

This script generates x, y, z coordinates for all nodes so the frontend
doesn't need to compute force simulation in real-time.

Usage:
    python precompute_force_layout.py [--output force_layout_3d.pkl]
        [--algorithm {spring,fa2}] [--iterations N] [--seed N] [--graph PATH]
"""

import os
import sys
import time
import pickle
import argparse
import logging
from pathlib import Path
from typing import Dict, Tuple, Optional
import numpy as np

# Put the directory two levels above this file (the parent of this script's
# directory) at the front of sys.path.
# NOTE(review): presumably this is the backend package root so sibling modules
# can be imported; nothing in this file imports from it - confirm it is needed.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Configure root logging once at import time: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)


def load_model_data() -> 'pd.DataFrame':
    """Load the model table from the first data source that exists.

    Sources are tried in this order, relative to the root directory (the
    grandparent of the directory containing this file):

    1. any ``*.parquet`` file in ``precomputed_data/``
    2. ``precomputed_data/models.csv``
    3. any ``*.csv`` file in ``data/`` whose name contains "model"
       (case-insensitive)

    When several files match a glob, the lexicographically first path is
    used, so repeated runs pick the same file regardless of the order in
    which the filesystem lists the directory.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If none of the sources exist.
    """
    import pandas as pd

    backend_dir = Path(__file__).parent.parent
    root_dir = backend_dir.parent

    # Try precomputed data first
    precomputed_dir = root_dir / "precomputed_data"
    if precomputed_dir.exists():
        # glob() yields entries in filesystem order; sort for a stable choice.
        parquet_files = sorted(precomputed_dir.glob("*.parquet"))
        if parquet_files:
            logger.info(f"Loading from precomputed parquet: {parquet_files[0]}")
            return pd.read_parquet(parquet_files[0])

    # Try CSV data
    csv_path = precomputed_dir / "models.csv"
    if csv_path.exists():
        logger.info(f"Loading from CSV: {csv_path}")
        return pd.read_csv(csv_path)

    # Try data directory
    data_dir = root_dir / "data"
    if data_dir.exists():
        for csv_file in sorted(data_dir.glob("*.csv")):
            if "model" in csv_file.name.lower():
                logger.info(f"Loading from {csv_file}")
                return pd.read_csv(csv_file)

    raise FileNotFoundError("No model data found")


def load_existing_graph(graph_path: Optional[str] = None) -> Optional['nx.DiGraph']:
    """Load a previously pickled graph, if one can be found.

    Args:
        graph_path: Optional explicit path to a pickle file. When given and
            the file exists, it is loaded and returned. When given but
            missing, a warning is logged and the default locations are
            searched instead.

    Returns:
        The unpickled object from the first existing file, or ``None`` when
        neither ``graph_path`` nor any default location exists.

    Note:
        ``pickle.load`` can execute arbitrary code embedded in the file.
        Only point this function at graph files from a trusted source.
    """
    if graph_path:
        explicit_path = Path(graph_path)
        if explicit_path.exists():
            logger.info(f"Loading existing graph from {graph_path}")
            # SECURITY: unpickling runs code from the file; trusted input only.
            with open(explicit_path, 'rb') as f:
                return pickle.load(f)
        # Do not silently substitute another graph for a mistyped path.
        logger.warning(
            f"Graph file not found at {graph_path}; searching default locations"
        )

    # Search for graph file
    graph_rel = Path("ai-ecosystem") / "data" / "ai_ecosystem_graph.pkl"
    three_up = Path(__file__).parent.parent.parent
    search_paths = [
        three_up / graph_rel,
        three_up.parent / graph_rel,
        Path.home() / "ai-ecosystem-v2" / graph_rel,
    ]

    for path in search_paths:
        if path.exists():
            logger.info(f"Found existing graph at {path}")
            # SECURITY: unpickling runs code from the file; trusted input only.
            with open(path, 'rb') as f:
                return pickle.load(f)

    return None


def build_network_graph(df: 'pd.DataFrame') -> 'nx.DiGraph':
    """Build a directed parent -> derivative graph from the model dataframe.

    Each row with a usable model ID (column ``model_id``, falling back to
    ``modelId``) becomes a node carrying ``downloads``, ``likes``,
    ``library`` and ``pipeline`` attributes. An edge ``parent -> model`` with
    ``edge_type='derivative'`` is added when the row names a parent (column
    ``parent_model``, falling back to ``base_model``) that is itself a node.

    Rows whose model ID is missing (``None``/NaN) or empty are skipped, so no
    spurious ``'nan'`` node is created.

    Args:
        df: Model table; one row per model.

    Returns:
        The populated ``networkx.DiGraph``.
    """
    import networkx as nx
    # pandas must be imported here: the module does not import it globally,
    # and the missing-value checks below need it.
    import pandas as pd

    def _is_missing(value) -> bool:
        """Return True for None/NaN scalars; list-like values count as present."""
        if value is None:
            return True
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            # pd.isna() on a multi-element sequence yields an array whose
            # truth value is ambiguous; such a value is not "missing".
            return False

    logger.info(f"Building network from {len(df):,} models...")
    G = nx.DiGraph()

    # Single pass over the rows: add nodes now, remember candidate edges and
    # resolve them once every node exists (a parent may appear after its child).
    pending_edges = []
    for _, row in df.iterrows():
        raw_id = row.get('model_id', row.get('modelId', ''))
        if _is_missing(raw_id):
            continue
        model_id = str(raw_id)
        if not model_id:
            continue

        G.add_node(
            model_id,
            downloads=row.get('downloads', 0),
            likes=row.get('likes', 0),
            library=row.get('library_name', row.get('library', '')),
            pipeline=row.get('pipeline_tag', ''),
        )

        raw_parent = row.get('parent_model', row.get('base_model', None))
        if _is_missing(raw_parent):
            continue
        parent_id = str(raw_parent).strip()
        if parent_id and parent_id != 'nan':
            pending_edges.append((parent_id, model_id))

    # Add edges based on parent relationships (only to known nodes)
    for parent_id, model_id in pending_edges:
        if parent_id in G:
            G.add_edge(parent_id, model_id, edge_type='derivative')

    # Report the graph's own edge count so duplicate rows are not double-counted.
    logger.info(f"Network: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")
    return G


def compute_force_layout_3d(
    G: 'nx.Graph',
    iterations: int = 100,
    seed: int = 42
) -> Dict[str, Tuple[float, float, float]]:
    """
    Compute a 3D force-directed layout using networkx ``spring_layout``.

    Graphs with at most 100,000 nodes are laid out in one ``spring_layout``
    call (scale 200). Larger graphs are split into connected components
    (treating edges as undirected); each component is laid out separately
    with a reduced iteration count and shifted along the x axis so components
    do not overlap. Single-node components are scattered randomly.

    All random fallback/scatter positions come from a private generator
    seeded with ``seed``, so results are reproducible and the global
    ``np.random`` state is left untouched.

    Args:
        G: Graph to lay out. A ``DiGraph`` is converted to undirected first.
        iterations: Maximum number of spring-layout iterations.
        seed: Seed for ``spring_layout`` and for the random placements.

    Returns:
        Mapping of node -> ``(x, y, z)`` as plain Python floats; empty when
        the graph has no nodes.
    """
    import networkx as nx

    n_nodes = G.number_of_nodes()
    logger.info(f"Computing 3D layout for {n_nodes:,} nodes...")

    if n_nodes == 0:
        return {}

    start_time = time.time()

    # Private RNG: seeded for reproducibility, no side effect on np.random.
    rng = np.random.RandomState(seed)

    def _random_point(center_x: float, spread_x: float, spread_yz: float):
        """Draw one random (x, y, z) point around ``center_x``."""
        return (
            float(center_x + rng.randn() * spread_x),
            float(rng.randn() * spread_yz),
            float(rng.randn() * spread_yz),
        )

    # The layout only uses connectivity, so treat the graph as undirected.
    G_undirected = G.to_undirected() if isinstance(G, nx.DiGraph) else G

    # For large graphs, compute layout per connected component
    if n_nodes > 100000:
        logger.info("Large graph detected - using optimized approach...")

        components = sorted(
            nx.connected_components(G_undirected), key=len, reverse=True
        )
        logger.info(f"Found {len(components):,} connected components")

        positions = {}
        offset_x = 0.0

        for i, component in enumerate(components):
            if len(component) < 2:
                # Isolated nodes - place randomly
                for node in component:
                    positions[node] = _random_point(offset_x, 10, 100)
                continue

            subgraph = G_undirected.subgraph(component)

            # Use spring layout with reduced iterations for large components
            iter_count = min(iterations, max(20, 100 - len(component) // 10000))
            # Larger components get more room (logarithmic in their size).
            size_factor = np.log10(max(len(component), 10))

            logger.info(f"  Component {i+1}/{len(components)}: {len(component):,} nodes, {iter_count} iterations")

            try:
                component_pos = nx.spring_layout(
                    subgraph,
                    dim=3,
                    k=1.0 / np.sqrt(len(component)),
                    iterations=iter_count,
                    seed=seed + i,
                    scale=100 * size_factor
                )
            except Exception as e:
                # Deliberate best-effort: one failing component must not
                # abort the whole layout.
                logger.warning(f"Layout failed for component {i}: {e}")
                for node in component:
                    positions[node] = _random_point(offset_x, 50, 50)
            else:
                # Apply offset to separate components
                for node, (x, y, z) in component_pos.items():
                    positions[node] = (float(x + offset_x), float(y), float(z))

            # Always advance, so a failed component does not overlap the next.
            offset_x += 300 * size_factor
    else:
        # Standard approach for smaller graphs
        try:
            positions_raw = nx.spring_layout(
                G_undirected,
                dim=3,
                k=2.0 / np.sqrt(n_nodes),
                iterations=iterations,
                seed=seed,
                scale=200
            )
            positions = {
                node: tuple(float(c) for c in pos)
                for node, pos in positions_raw.items()
            }
        except Exception as e:
            logger.warning(f"Spring layout failed: {e}, using random positions")
            positions = {node: _random_point(0.0, 100, 100) for node in G.nodes()}

    elapsed = time.time() - start_time
    logger.info(f"Layout computed in {elapsed:.1f}s")

    return positions


def compute_force_layout_fa2(
    G: 'nx.Graph',
    iterations: int = 100,
    seed: int = 42
) -> Dict[str, Tuple[float, float, float]]:
    """
    Compute a layout with the ForceAtlas2 algorithm (Barnes-Hut enabled).

    ForceAtlas2 produces 2D positions, which are scaled by 100. The z
    coordinate is synthesised from each node's ``downloads`` attribute
    (``log10(max(downloads, 1)) * 10``) plus seeded Gaussian jitter, so more
    downloaded models sit higher.

    Falls back to ``compute_force_layout_3d`` when the ``fa2`` package is not
    installed.

    Args:
        G: Graph to lay out. A ``DiGraph`` is converted to undirected first.
        iterations: Number of ForceAtlas2 iterations.
        seed: Seed for the z-axis jitter (and for the fallback layout).

    Returns:
        Mapping of node -> ``(x, y, z)``; empty when the graph has no nodes.
    """
    # Import before first use. A function-local import that runs only after
    # `nx` is referenced makes `nx` an unbound local and raises
    # UnboundLocalError.
    import networkx as nx

    # Keep the try narrow: only a missing fa2 package triggers the fallback.
    try:
        from fa2 import ForceAtlas2
    except ImportError:
        logger.warning("fa2 not installed, falling back to spring_layout")
        return compute_force_layout_3d(G, iterations, seed)

    n_nodes = G.number_of_nodes()
    logger.info(f"Computing FA2 layout for {n_nodes:,} nodes...")

    if n_nodes == 0:
        return {}

    # Convert to undirected for layout
    G_layout = G.to_undirected() if isinstance(G, nx.DiGraph) else G

    # Initialize ForceAtlas2
    layout_engine = ForceAtlas2(
        outboundAttractionDistribution=True,
        linLogMode=False,
        adjustSizes=False,
        edgeWeightInfluence=1.0,
        jitterTolerance=1.0,
        barnesHutOptimize=True,
        barnesHutTheta=1.2,
        multiThreaded=False,
        scalingRatio=2.0,
        strongGravityMode=False,
        gravity=1.0,
        verbose=False
    )

    # Compute 2D positions
    positions_2d = layout_engine.forceatlas2_networkx_layout(
        G_layout,
        iterations=iterations
    )

    # Add 3rd dimension based on downloads (popular models higher).
    # Private RNG: same stream as seeding np.random, without touching the
    # global state.
    rng = np.random.RandomState(seed)
    positions = {}
    for node, (x, y) in positions_2d.items():
        raw_downloads = G.nodes[node].get('downloads', 0)
        try:
            downloads = float(raw_downloads)
        except (TypeError, ValueError):
            downloads = 0.0
        # Missing (None/NaN) or infinite counts must not poison the z value.
        if not np.isfinite(downloads):
            downloads = 0.0
        z = np.log10(max(downloads, 1.0)) * 10 + rng.randn() * 5
        positions[node] = (x * 100, y * 100, float(z))

    return positions

def save_layout(
    positions: Dict[str, Tuple[float, float, float]],
    output_path: str,
    graph: 'nx.Graph' = None
):
    """Pickle the layout and a little metadata to ``output_path``.

    The pickled dict holds ``positions``, ``n_nodes`` and a local-time
    ``timestamp``; ``n_edges`` is added when ``graph`` is given, and
    ``bounds`` (per-axis min/max) when ``positions`` is non-empty. A short
    summary is logged after writing.
    """
    node_total = len(positions)
    data = dict(
        positions=positions,
        n_nodes=node_total,
        timestamp=time.strftime('%Y-%m-%d %H:%M:%S'),
    )

    if graph is not None:
        data['n_edges'] = graph.number_of_edges()

    # Per-axis extent of the layout, one axis at a time.
    if positions:
        points = list(positions.values())
        extent = {}
        for axis, label in enumerate('xyz'):
            column = [point[axis] for point in points]
            extent[label + '_min'] = min(column)
            extent[label + '_max'] = max(column)
        data['bounds'] = extent

    with open(output_path, 'wb') as sink:
        pickle.dump(data, sink)

    logger.info(f"Saved layout to {output_path}")
    logger.info(f"  Nodes: {len(positions):,}")
    b = data.get('bounds')
    if b is not None:
        logger.info(f"  Bounds: X[{b['x_min']:.1f}, {b['x_max']:.1f}], Y[{b['y_min']:.1f}, {b['y_max']:.1f}], Z[{b['z_min']:.1f}, {b['z_max']:.1f}]")


def main():
    """Command-line entry point.

    Parses the options, obtains a graph (an existing pickle if one is found,
    otherwise one built from the model data), runs the selected layout
    algorithm and writes the result into the ``precomputed_data`` directory
    under the root directory.
    """
    # Option table: (flags, keyword arguments) for each CLI argument.
    option_specs = [
        (('--output', '-o'),
         dict(type=str, default='force_layout_3d.pkl',
              help='Output pickle file path')),
        (('--iterations', '-i'),
         dict(type=int, default=100,
              help='Number of layout iterations')),
        (('--algorithm', '-a'),
         dict(choices=['spring', 'fa2'], default='spring',
              help='Layout algorithm to use')),
        (('--seed', '-s'),
         dict(type=int, default=42,
              help='Random seed for reproducibility')),
        (('--graph', '-g'),
         dict(type=str, default=None,
              help='Path to existing networkx graph pickle file')),
    ]
    parser = argparse.ArgumentParser(description='Pre-compute force-directed layout')
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    # Output always lands in <root>/precomputed_data.
    precomputed_dir = Path(__file__).parent.parent.parent / "precomputed_data"
    precomputed_dir.mkdir(exist_ok=True)
    output_path = precomputed_dir / args.output

    banner = "=" * 60
    logger.info(banner)
    logger.info("Pre-computing Force-Directed Layout")
    logger.info(banner)

    # A pre-built graph is preferred because loading it is faster than
    # rebuilding from the model table.
    G = load_existing_graph(args.graph)
    if G is not None:
        logger.info(f"Using existing graph: {G.number_of_nodes():,} nodes, {G.number_of_edges():,} edges")
    else:
        df = load_model_data()
        logger.info(f"Loaded {len(df):,} models")
        G = build_network_graph(df)

    # Pick the layout routine for the requested algorithm.
    if args.algorithm == 'fa2':
        layout_fn = compute_force_layout_fa2
    else:
        layout_fn = compute_force_layout_3d
    positions = layout_fn(G, args.iterations, args.seed)

    save_layout(positions, str(output_path), G)

    logger.info(banner)
    logger.info("Done!")
    logger.info(banner)

if __name__ == "__main__":
    main()