Spaces:

midah
/

hf-viz

Sleeping

App Files Files Community

hf-viz / backend /api /main.py

midah

Add network pre-computation, styling improvements, and theme toggle

3e85304 about 2 months ago

raw

history blame contribute delete

104 kB

	import sys
	import os
	import pickle
	import tempfile
	import logging
	from typing import Optional, List, Dict
	from datetime import datetime, timedelta

	import pandas as pd
	import numpy as np
	import httpx
	from fastapi import FastAPI, HTTPException, Query, BackgroundTasks, Request
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.middleware.gzip import GZipMiddleware
	from fastapi.responses import FileResponse, JSONResponse
	from fastapi.exceptions import RequestValidationError
	from starlette.exceptions import HTTPException as StarletteHTTPException
	from pydantic import BaseModel
	from umap import UMAP

	from utils.data_loader import ModelDataLoader
	from utils.embeddings import ModelEmbedder
	from utils.dimensionality_reduction import DimensionReducer
	from utils.network_analysis import ModelNetworkBuilder
	from utils.graph_embeddings import GraphEmbedder
	from services.model_tracker import get_tracker
	from services.arxiv_api import extract_arxiv_ids, fetch_arxiv_papers
	from core.config import settings
	from core.exceptions import DataNotLoadedError, EmbeddingsNotReadyError
	from models.schemas import ModelPoint
	from utils.family_tree import calculate_family_depths
	from utils.cache import cache, cached_response
	from utils.response_encoder import FastJSONResponse, MessagePackResponse, encode_models_msgpack
	import api.dependencies as deps
	from api.routes import models, stats, clusters

	# Create aliases for backward compatibility with existing routes
	# Note: These are set at module load time and may be None initially
	# Functions should access via deps.* to get current values
	data_loader = deps.data_loader

	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	if backend_dir not in sys.path:
	sys.path.insert(0, backend_dir)

	logger = logging.getLogger(__name__)

	app = FastAPI(title="HF Model Ecosystem API", version="2.0.0")

	app.add_middleware(GZipMiddleware, minimum_size=1000)

	CORS_HEADERS = {
	"Access-Control-Allow-Origin": "*",
	"Access-Control-Allow-Methods": "*",
	"Access-Control-Allow-Headers": "*",
	}

	@app.exception_handler(Exception)
	async def global_exception_handler(request: Request, exc: Exception):
	logger.exception("Unhandled exception", exc_info=exc)
	return JSONResponse(
	status_code=500,
	content={"detail": "Internal server error"},
	headers=CORS_HEADERS,
	)

	@app.exception_handler(StarletteHTTPException)
	async def http_exception_handler(request: Request, exc: StarletteHTTPException):
	return JSONResponse(
	status_code=exc.status_code,
	content={"detail": exc.detail},
	headers=CORS_HEADERS,
	)

	@app.exception_handler(RequestValidationError)
	async def validation_exception_handler(request: Request, exc: RequestValidationError):
	return JSONResponse(
	status_code=422,
	content={"detail": exc.errors()},
	headers=CORS_HEADERS,
	)

	if settings.ALLOW_ALL_ORIGINS:
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=False,
	allow_methods=["*"],
	allow_headers=["*"],
	)
	else:
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["http://localhost:3000", settings.FRONTEND_URL],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)


	# Include routers
	app.include_router(models.router)
	app.include_router(stats.router)
	app.include_router(clusters.router)

	@app.on_event("startup")
	async def startup_event():
	"""
	Fast startup using pre-computed data.
	Falls back to traditional loading if pre-computed data not available.
	"""
	import time
	startup_start = time.time()

	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)

	# Try to load pre-computed data first (instant startup!)
	from utils.precomputed_loader import get_precomputed_loader

	precomputed_loader = get_precomputed_loader(version="v1")

	if precomputed_loader:
	logger.info("=" * 60)
	logger.info("LOADING PRE-COMPUTED DATA (Fast Startup Mode)")
	logger.info("=" * 60)

	try:
	# Check if chunked embeddings are available
	is_chunked = precomputed_loader.is_chunked()

	# Load data - don't load embeddings if chunked (load on-demand instead)
	load_embeddings_at_startup = not is_chunked # Only load if not chunked
	deps.df, deps.embeddings, metadata = precomputed_loader.load_all(
	load_embeddings=load_embeddings_at_startup
	)

	# Initialize chunked loader if chunked data is available
	if is_chunked:
	chunked_loader = precomputed_loader.get_chunked_loader()
	if chunked_loader:
	deps.chunked_embedding_loader = chunked_loader
	logger.info("Chunked embedding loader initialized - embeddings will be loaded on-demand")
	else:
	logger.warning("Chunked data detected but chunked loader unavailable - falling back to full load")
	# Fallback: try to load all embeddings
	deps.df, deps.embeddings, metadata = precomputed_loader.load_all(load_embeddings=True)

	# Extract 3D coordinates from dataframe
	deps.reduced_embeddings = np.column_stack([
	deps.df['x_3d'].values,
	deps.df['y_3d'].values,
	deps.df['z_3d'].values
	])

	# Initialize embedder (without loading/generating embeddings)
	deps.embedder = ModelEmbedder()

	# Initialize reducer (already fitted)
	deps.reducer = DimensionReducer(method="umap", n_components=3)

	# No graph embeddings in fast mode (optional feature)
	deps.graph_embedder = None
	deps.graph_embeddings_dict = None
	deps.combined_embeddings = None
	deps.reduced_embeddings_graph = None

	startup_time = time.time() - startup_start
	logger.info("=" * 60)
	logger.info(f"STARTUP COMPLETE in {startup_time:.2f} seconds!")
	logger.info(f"Loaded {len(deps.df):,} models with pre-computed coordinates")
	if is_chunked:
	logger.info("Using chunked embeddings - fast startup mode enabled")
	logger.info(f"Unique libraries: {metadata.get('unique_libraries')}")
	logger.info(f"Unique pipelines: {metadata.get('unique_pipelines')}")
	logger.info("=" * 60)

	# Update module-level aliases
	df = deps.df
	embedder = deps.embedder
	reducer = deps.reducer
	embeddings = deps.embeddings
	reduced_embeddings = deps.reduced_embeddings

	return

	except Exception as e:
	logger.warning(f"Failed to load pre-computed data: {e}")
	logger.info("Falling back to traditional loading...")

	else:
	logger.info("=" * 60)
	logger.info("Pre-computed data not found.")
	logger.info("To enable fast startup, run:")
	logger.info(" cd backend && python scripts/precompute_data.py --sample-size 150000")
	logger.info("=" * 60)
	logger.info("Falling back to traditional loading (may take 1-8 hours)...")

	# Traditional loading (slow path)
	cache_dir = os.path.join(root_dir, "cache")
	os.makedirs(cache_dir, exist_ok=True)

	embeddings_cache = os.path.join(cache_dir, "embeddings.pkl")
	graph_embeddings_cache = os.path.join(cache_dir, "graph_embeddings.pkl")
	combined_embeddings_cache = os.path.join(cache_dir, "combined_embeddings.pkl")
	reduced_cache_umap = os.path.join(cache_dir, "reduced_umap_3d.pkl")
	reduced_cache_umap_graph = os.path.join(cache_dir, "reduced_umap_3d_graph.pkl")
	reducer_cache_umap = os.path.join(cache_dir, "reducer_umap_3d.pkl")
	reducer_cache_umap_graph = os.path.join(cache_dir, "reducer_umap_3d_graph.pkl")

	# Load dataset with sample (for reasonable startup time)
	sample_size = settings.SAMPLE_SIZE or settings.get_sample_size() or 5000
	logger.info(f"Loading dataset (sample_size={sample_size}, prioritizing base models)...")

	deps.df = deps.data_loader.load_data(sample_size=sample_size, prioritize_base_models=True)
	deps.df = deps.data_loader.preprocess_for_embedding(deps.df)

	if 'model_id' in deps.df.columns:
	deps.df.set_index('model_id', drop=False, inplace=True)
	for col in ['downloads', 'likes']:
	if col in deps.df.columns:
	deps.df[col] = pd.to_numeric(deps.df[col], errors='coerce').fillna(0).astype(int)

	deps.embedder = ModelEmbedder()

	# Load or generate text embeddings
	if os.path.exists(embeddings_cache):
	try:
	deps.embeddings = deps.embedder.load_embeddings(embeddings_cache)
	except (IOError, pickle.UnpicklingError, EOFError) as e:
	logger.warning(f"Failed to load cached embeddings: {e}")
	deps.embeddings = None

	if deps.embeddings is None:
	texts = deps.df['combined_text'].tolist()
	deps.embeddings = deps.embedder.generate_embeddings(texts, batch_size=128)
	deps.embedder.save_embeddings(deps.embeddings, embeddings_cache)

	# Skip graph embeddings in fallback mode (too slow)
	deps.graph_embedder = None
	deps.graph_embeddings_dict = None
	deps.combined_embeddings = None

	# Initialize reducer for text embeddings
	deps.reducer = DimensionReducer(method="umap", n_components=3)

	# Pre-compute clusters for faster requests
	logger.info("Pre-computing clusters...")

	if os.path.exists(reduced_cache_umap) and os.path.exists(reducer_cache_umap):
	try:
	with open(reduced_cache_umap, 'rb') as f:
	deps.reduced_embeddings = pickle.load(f)
	deps.reducer.load_reducer(reducer_cache_umap)
	except (IOError, pickle.UnpicklingError, EOFError) as e:
	logger.warning(f"Failed to load cached reduced embeddings: {e}")
	deps.reduced_embeddings = None

	if deps.reduced_embeddings is None:
	deps.reducer.reducer = UMAP(
	n_components=3,
	n_neighbors=30,
	min_dist=0.3,
	metric='cosine',
	random_state=42,
	n_jobs=-1,
	low_memory=True,
	spread=1.5
	)
	deps.reduced_embeddings = deps.reducer.fit_transform(deps.embeddings)
	with open(reduced_cache_umap, 'wb') as f:
	pickle.dump(deps.reduced_embeddings, f)
	deps.reducer.save_reducer(reducer_cache_umap)

	# No graph embeddings in fallback mode
	deps.reduced_embeddings_graph = None

	# Pre-compute clusters now instead of on first request
	if deps.reduced_embeddings is not None and len(deps.reduced_embeddings) > 0:
	models.cluster_labels = compute_clusters(
	deps.reduced_embeddings,
	n_clusters=min(50, len(deps.reduced_embeddings) // 100)
	)
	logger.info(f"Pre-computed {len(set(models.cluster_labels))} clusters")

	startup_time = time.time() - startup_start
	logger.info(f"Startup complete in {startup_time:.2f} seconds")

	# Update module-level aliases
	df = deps.df
	embedder = deps.embedder
	graph_embedder = deps.graph_embedder
	reducer = deps.reducer
	embeddings = deps.embeddings
	graph_embeddings_dict = deps.graph_embeddings_dict
	combined_embeddings = deps.combined_embeddings
	reduced_embeddings = deps.reduced_embeddings
	reduced_embeddings_graph = deps.reduced_embeddings_graph


	from utils.family_tree import calculate_family_depths


	def compute_clusters(reduced_embeddings: np.ndarray, n_clusters: int = 50) -> np.ndarray:
	from sklearn.cluster import KMeans

	n_samples = len(reduced_embeddings)
	if n_samples < n_clusters:
	n_clusters = max(1, n_samples // 10)

	kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
	return kmeans.fit_predict(reduced_embeddings)


	@app.get("/")
	async def root():
	# Check if frontend build exists and serve it
	_backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	_frontend_build_path = os.path.join(os.path.dirname(_backend_dir), "frontend", "build")
	index_path = os.path.join(_frontend_build_path, "index.html")

	if os.path.exists(index_path):
	from starlette.responses import FileResponse as StarletteFileResponse
	return StarletteFileResponse(index_path)

	# Fallback to API status when no frontend build
	return {"message": "HF Model Ecosystem API", "status": "running"}


	@app.get("/api/models")
	async def get_models(
	min_downloads: int = Query(0),
	min_likes: int = Query(0),
	search_query: Optional[str] = Query(None),
	color_by: str = Query("library_name"),
	size_by: str = Query("downloads"),
	max_points: Optional[int] = Query(10000), # REDUCED from None (was 50k default in frontend)
	projection_method: str = Query("umap"),
	base_models_only: bool = Query(False),
	max_hierarchy_depth: Optional[int] = Query(None, ge=0, description="Filter to models at or below this hierarchy depth."),
	use_graph_embeddings: bool = Query(False, description="Use graph-aware embeddings that respect family tree structure"),
	format: str = Query("json", regex="^(json\|msgpack)$", description="Response format: json or msgpack")
	):
	if deps.df is None:
	raise DataNotLoadedError()

	df = deps.df

	# Filter data
	filtered_df = data_loader.filter_data(
	df=df,
	min_downloads=min_downloads,
	min_likes=min_likes,
	search_query=search_query,
	libraries=None, # Can be added as query params
	pipeline_tags=None
	)

	if base_models_only:
	if 'parent_model' in filtered_df.columns:
	filtered_df = filtered_df[
	filtered_df['parent_model'].isna() \|
	(filtered_df['parent_model'].astype(str).str.strip() == '') \|
	(filtered_df['parent_model'].astype(str) == 'nan')
	]

	if max_hierarchy_depth is not None:
	family_depths = calculate_family_depths(df)
	filtered_df = filtered_df[
	filtered_df['model_id'].astype(str).map(lambda x: family_depths.get(x, 0) <= max_hierarchy_depth)
	]

	filtered_count = len(filtered_df)

	if len(filtered_df) == 0:
	return {
	"models": [],
	"filtered_count": 0,
	"returned_count": 0
	}

	# Handle max_points: None means no limit, very large number also means no limit
	effective_max_points = None if max_points is None or max_points >= 1000000 else max_points

	if effective_max_points is not None and len(filtered_df) > effective_max_points:
	if 'library_name' in filtered_df.columns and filtered_df['library_name'].notna().any():
	# Sample proportionally by library, preserving all columns
	sampled_dfs = []
	for lib_name, group in filtered_df.groupby('library_name', group_keys=False):
	n_samples = max(1, int(effective_max_points * len(group) / len(filtered_df)))
	sampled_dfs.append(group.sample(min(len(group), n_samples), random_state=42))
	filtered_df = pd.concat(sampled_dfs, ignore_index=True)
	if len(filtered_df) > effective_max_points:
	filtered_df = filtered_df.sample(n=effective_max_points, random_state=42).reset_index(drop=True)
	else:
	filtered_df = filtered_df.reset_index(drop=True)
	else:
	filtered_df = filtered_df.sample(n=effective_max_points, random_state=42).reset_index(drop=True)

	# Determine which embeddings to use
	if use_graph_embeddings and combined_embeddings is not None:
	current_embeddings = combined_embeddings
	current_reduced = reduced_embeddings_graph
	embedding_type = "graph-aware"
	else:
	if embeddings is None:
	raise EmbeddingsNotReadyError()
	current_embeddings = embeddings
	current_reduced = reduced_embeddings
	embedding_type = "text-only"

	# Handle reduced embeddings loading/generation
	if current_reduced is None or (reducer and reducer.method != projection_method.lower()):
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	cache_dir = os.path.join(root_dir, "cache")
	cache_suffix = "_graph" if use_graph_embeddings and combined_embeddings is not None else ""
	reduced_cache = os.path.join(cache_dir, f"reduced_{projection_method.lower()}_3d{cache_suffix}.pkl")
	reducer_cache = os.path.join(cache_dir, f"reducer_{projection_method.lower()}_3d{cache_suffix}.pkl")

	if os.path.exists(reduced_cache) and os.path.exists(reducer_cache):
	try:
	with open(reduced_cache, 'rb') as f:
	current_reduced = pickle.load(f)
	if reducer is None or reducer.method != projection_method.lower():
	reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
	reducer.load_reducer(reducer_cache)
	except (IOError, pickle.UnpicklingError, EOFError) as e:
	logger.warning(f"Failed to load cached reduced embeddings: {e}")
	current_reduced = None

	if current_reduced is None:
	if reducer is None or reducer.method != projection_method.lower():
	reducer = DimensionReducer(method=projection_method.lower(), n_components=3)
	if projection_method.lower() == "umap":
	reducer.reducer = UMAP(
	n_components=3,
	n_neighbors=30,
	min_dist=0.3,
	metric='cosine',
	random_state=42,
	n_jobs=-1,
	low_memory=True,
	spread=1.5
	)
	current_reduced = reducer.fit_transform(current_embeddings)
	with open(reduced_cache, 'wb') as f:
	pickle.dump(current_reduced, f)
	reducer.save_reducer(reducer_cache)

	# Update global variable
	if use_graph_embeddings and deps.combined_embeddings is not None:
	deps.reduced_embeddings_graph = current_reduced
	else:
	deps.reduced_embeddings = current_reduced

	# Get indices for filtered data
	# Use model_id column to map between filtered_df and original df
	# This is safer than using index positions which can change after filtering
	filtered_model_ids = filtered_df['model_id'].astype(str).values

	# Map model_ids to positions in original df
	if df.index.name == 'model_id' or 'model_id' in df.index.names:
	# When df is indexed by model_id, use get_loc directly
	filtered_indices = []
	for model_id in filtered_model_ids:
	try:
	pos = df.index.get_loc(model_id)
	# Handle both single position and array of positions
	if isinstance(pos, (int, np.integer)):
	filtered_indices.append(int(pos))
	elif isinstance(pos, (slice, np.ndarray)):
	# If multiple matches, take first
	if isinstance(pos, slice):
	filtered_indices.append(int(pos.start))
	else:
	filtered_indices.append(int(pos[0]))
	except (KeyError, TypeError):
	continue
	filtered_indices = np.array(filtered_indices, dtype=np.int32)
	else:
	# When df is not indexed by model_id, find positions by matching model_id column
	df_model_ids = df['model_id'].astype(str).values
	model_id_to_pos = {mid: pos for pos, mid in enumerate(df_model_ids)}
	filtered_indices = np.array([
	model_id_to_pos[mid] for mid in filtered_model_ids
	if mid in model_id_to_pos
	], dtype=np.int32)

	if len(filtered_indices) == 0:
	return {
	"models": [],
	"embedding_type": embedding_type,
	"filtered_count": filtered_count,
	"returned_count": 0
	}

	filtered_reduced = current_reduced[filtered_indices]
	family_depths = calculate_family_depths(df)

	# Use appropriate embeddings for clustering
	clustering_embeddings = current_reduced
	# Compute clusters if not already computed or if size changed
	if models.cluster_labels is None or len(models.cluster_labels) != len(clustering_embeddings):
	models.cluster_labels = compute_clusters(clustering_embeddings, n_clusters=min(50, len(clustering_embeddings) // 100))

	# Handle case where cluster_labels might not match filtered data yet
	if models.cluster_labels is not None and len(models.cluster_labels) > 0:
	if len(filtered_indices) <= len(models.cluster_labels):
	filtered_clusters = models.cluster_labels[filtered_indices]
	else:
	# Fallback: use first cluster for all if indices don't match
	filtered_clusters = np.zeros(len(filtered_indices), dtype=int)
	else:
	filtered_clusters = np.zeros(len(filtered_indices), dtype=int)

	model_ids = filtered_df['model_id'].astype(str).values
	library_names = filtered_df.get('library_name', pd.Series([None] * len(filtered_df))).values
	pipeline_tags = filtered_df.get('pipeline_tag', pd.Series([None] * len(filtered_df))).values
	downloads_arr = filtered_df.get('downloads', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
	likes_arr = filtered_df.get('likes', pd.Series([0] * len(filtered_df))).fillna(0).astype(int).values
	trending_scores = filtered_df.get('trendingScore', pd.Series([None] * len(filtered_df))).values
	tags_arr = filtered_df.get('tags', pd.Series([None] * len(filtered_df))).values
	parent_models = filtered_df.get('parent_model', pd.Series([None] * len(filtered_df))).values
	licenses_arr = filtered_df.get('licenses', pd.Series([None] * len(filtered_df))).values
	created_at_arr = filtered_df.get('createdAt', pd.Series([None] * len(filtered_df))).values

	x_coords = filtered_reduced[:, 0].astype(float)
	y_coords = filtered_reduced[:, 1].astype(float)
	z_coords = filtered_reduced[:, 2].astype(float) if filtered_reduced.shape[1] > 2 else np.zeros(len(filtered_reduced), dtype=float)
	models = [
	ModelPoint(
	model_id=model_ids[idx],
	x=float(x_coords[idx]),
	y=float(y_coords[idx]),
	z=float(z_coords[idx]),
	library_name=library_names[idx] if pd.notna(library_names[idx]) else None,
	pipeline_tag=pipeline_tags[idx] if pd.notna(pipeline_tags[idx]) else None,
	downloads=int(downloads_arr[idx]),
	likes=int(likes_arr[idx]),
	trending_score=float(trending_scores[idx]) if idx < len(trending_scores) and pd.notna(trending_scores[idx]) else None,
	tags=tags_arr[idx] if idx < len(tags_arr) and pd.notna(tags_arr[idx]) else None,
	parent_model=parent_models[idx] if idx < len(parent_models) and pd.notna(parent_models[idx]) else None,
	licenses=licenses_arr[idx] if idx < len(licenses_arr) and pd.notna(licenses_arr[idx]) else None,
	family_depth=family_depths.get(model_ids[idx], None),
	cluster_id=int(filtered_clusters[idx]) if idx < len(filtered_clusters) else None,
	created_at=str(created_at_arr[idx]) if idx < len(created_at_arr) and pd.notna(created_at_arr[idx]) else None
	)
	for idx in range(len(filtered_df))
	]

	# Return models with metadata about embedding type
	response_data = {
	"models": models,
	"embedding_type": embedding_type,
	"filtered_count": filtered_count,
	"returned_count": len(models)
	}

	# Return in requested format with caching headers
	if format == "msgpack":
	try:
	binary_data = encode_models_msgpack([m.dict() for m in models])
	return Response(
	content=binary_data,
	media_type="application/msgpack",
	headers={
	"Cache-Control": "public, max-age=300",
	"X-Content-Type-Options": "nosniff",
	"Access-Control-Expose-Headers": "Cache-Control"
	}
	)
	except Exception as e:
	logger.warning(f"MessagePack encoding failed, falling back to JSON: {e}")

	# Return JSON with caching headers
	return FastJSONResponse(
	content=response_data,
	headers={
	"Cache-Control": "public, max-age=300",
	"X-Content-Type-Options": "nosniff",
	"Access-Control-Expose-Headers": "Cache-Control"
	}
	)


	@app.get("/api/stats")
	async def get_stats():
	"""Get dataset statistics."""
	if df is None:
	raise DataNotLoadedError()

	total_models = len(df.index) if hasattr(df, 'index') else len(df)

	# Get unique licenses with counts
	licenses = {}
	if 'license' in df.columns:
	license_counts = df['license'].value_counts().to_dict()
	licenses = {str(k): int(v) for k, v in license_counts.items() if pd.notna(k) and str(k) != 'nan'}

	return {
	"total_models": total_models,
	"unique_libraries": int(df['library_name'].nunique()) if 'library_name' in df.columns else 0,
	"unique_pipelines": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
	"unique_task_types": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0, # Alias for clarity
	"unique_licenses": len(licenses),
	"licenses": licenses, # License name -> count mapping
	"avg_downloads": float(df['downloads'].mean()) if 'downloads' in df.columns else 0,
	"avg_likes": float(df['likes'].mean()) if 'likes' in df.columns else 0
	}


	@app.get("/api/model/{model_id}")
	async def get_model_details(model_id: str):
	"""Get detailed information about a specific model."""
	if df is None:
	raise DataNotLoadedError()

	model = df[df.get('model_id', '') == model_id]
	if len(model) == 0:
	raise HTTPException(status_code=404, detail="Model not found")

	model = model.iloc[0]

	tags_str = str(model.get('tags', '')) if pd.notna(model.get('tags')) else ''
	arxiv_ids = extract_arxiv_ids(tags_str)

	papers = []
	if arxiv_ids:
	papers = await fetch_arxiv_papers(arxiv_ids[:5]) # Limit to 5 papers

	return {
	"model_id": model.get('model_id'),
	"library_name": model.get('library_name'),
	"pipeline_tag": model.get('pipeline_tag'),
	"downloads": int(model.get('downloads', 0)),
	"likes": int(model.get('likes', 0)),
	"trending_score": float(model.get('trendingScore', 0)) if pd.notna(model.get('trendingScore')) else None,
	"tags": model.get('tags') if pd.notna(model.get('tags')) else None,
	"licenses": model.get('licenses') if pd.notna(model.get('licenses')) else None,
	"parent_model": model.get('parent_model') if pd.notna(model.get('parent_model')) else None,
	"arxiv_papers": papers,
	"arxiv_ids": arxiv_ids
	}


	# Clusters endpoint is handled by routes/clusters.py router

	@app.get("/api/family/stats")
	async def get_family_stats():
	"""
	Get aggregate statistics about family trees for paper visualizations.
	Returns family size distribution, depth statistics, model card length by depth, etc.
	"""
	if df is None:
	raise DataNotLoadedError()

	family_sizes = {}
	root_models = set()

	for idx, row in df.iterrows():
	model_id = str(row.get('model_id', ''))
	parent_id = row.get('parent_model')

	if pd.isna(parent_id) or str(parent_id) == 'nan' or str(parent_id) == '':
	root_models.add(model_id)
	if model_id not in family_sizes:
	family_sizes[model_id] = 0
	else:
	parent_id_str = str(parent_id)
	root = parent_id_str
	visited = set()
	while root in df.index and pd.notna(df.loc[root].get('parent_model')):
	parent = df.loc[root].get('parent_model')
	if pd.isna(parent) or str(parent) == 'nan' or str(parent) == '':
	break
	if str(parent) in visited:
	break
	visited.add(root)
	root = str(parent)

	if root not in family_sizes:
	family_sizes[root] = 0
	family_sizes[root] += 1

	size_distribution = {}
	for root, size in family_sizes.items():
	size_distribution[size] = size_distribution.get(size, 0) + 1

	depths = calculate_family_depths(df)
	depth_counts = {}
	for depth in depths.values():
	depth_counts[depth] = depth_counts.get(depth, 0) + 1

	model_card_lengths_by_depth = {}
	if 'modelCard' in df.columns:
	for idx, row in df.iterrows():
	model_id = str(row.get('model_id', ''))
	depth = depths.get(model_id, 0)
	model_card = row.get('modelCard', '')
	if pd.notna(model_card):
	card_length = len(str(model_card))
	if depth not in model_card_lengths_by_depth:
	model_card_lengths_by_depth[depth] = []
	model_card_lengths_by_depth[depth].append(card_length)

	model_card_stats = {}
	for depth, lengths in model_card_lengths_by_depth.items():
	if lengths:
	model_card_stats[depth] = {
	"mean": float(np.mean(lengths)),
	"median": float(np.median(lengths)),
	"q1": float(np.percentile(lengths, 25)),
	"q3": float(np.percentile(lengths, 75)),
	"min": float(np.min(lengths)),
	"max": float(np.max(lengths)),
	"count": len(lengths)
	}

	return {
	"total_families": len(root_models),
	"family_size_distribution": size_distribution,
	"depth_distribution": depth_counts,
	"max_family_size": max(family_sizes.values()) if family_sizes else 0,
	"max_depth": max(depths.values()) if depths else 0,
	"avg_family_size": sum(family_sizes.values()) / len(family_sizes) if family_sizes else 0,
	"model_card_length_by_depth": model_card_stats
	}


	@app.get("/api/family/top")
	async def get_top_families(
	limit: int = Query(50, ge=1, le=200, description="Maximum number of families to return"),
	min_size: int = Query(2, ge=1, description="Minimum family size to include")
	):
	"""
	Get top families by total lineage count (sum of all descendants).
	Calculates the actual family tree size by traversing parent-child relationships.
	"""
	if deps.df is None:
	raise DataNotLoadedError()

	df = deps.df

	# Build parent -> children mapping
	children_map = {}
	root_models = set()

	for idx, row in df.iterrows():
	model_id = str(row.get('model_id', ''))
	parent_id = row.get('parent_model')

	if pd.isna(parent_id) or str(parent_id) == 'nan' or str(parent_id) == '':
	root_models.add(model_id)
	else:
	parent_str = str(parent_id)
	if parent_str not in children_map:
	children_map[parent_str] = []
	children_map[parent_str].append(model_id)

	# For each root, count all descendants
	def count_descendants(model_id: str, visited: set) -> int:
	if model_id in visited:
	return 0
	visited.add(model_id)
	count = 1 # Count self
	for child in children_map.get(model_id, []):
	count += count_descendants(child, visited)
	return count

	# Calculate family sizes
	family_data = []
	for root in root_models:
	visited = set()
	total_count = count_descendants(root, visited)
	if total_count >= min_size:
	# Get organization from model_id
	org = root.split('/')[0] if '/' in root else root
	family_data.append({
	"root_model": root,
	"organization": org,
	"total_models": total_count,
	"depth_count": len(visited) # Same as total for tree traversal
	})

	# Sort by total count descending
	family_data.sort(key=lambda x: x['total_models'], reverse=True)

	# Also aggregate by organization (sum all families under same org)
	org_totals = {}
	for fam in family_data:
	org = fam['organization']
	if org not in org_totals:
	org_totals[org] = {
	"organization": org,
	"total_models": 0,
	"family_count": 0,
	"root_models": []
	}
	org_totals[org]['total_models'] += fam['total_models']
	org_totals[org]['family_count'] += 1
	if len(org_totals[org]['root_models']) < 5: # Keep top 5 root models
	org_totals[org]['root_models'].append(fam['root_model'])

	# Sort organizations by total models
	top_orgs = sorted(org_totals.values(), key=lambda x: x['total_models'], reverse=True)[:limit]

	return {
	"families": family_data[:limit],
	"organizations": top_orgs,
	"total_families": len(family_data),
	"total_root_models": len(root_models)
	}


	@app.get("/api/family/path/{model_id}")
	async def get_family_path(
	model_id: str,
	target_id: Optional[str] = Query(None, description="Target model ID. If None, returns path to root.")
	):
	"""
	Get path from model to root or to target model.
	Returns list of model IDs representing the path.
	"""
	if df is None:
	raise DataNotLoadedError()

	model_id_str = str(model_id)

	if df.index.name == 'model_id':
	if model_id_str not in df.index:
	raise HTTPException(status_code=404, detail="Model not found")
	else:
	model_rows = df[df.get('model_id', '') == model_id_str]
	if len(model_rows) == 0:
	raise HTTPException(status_code=404, detail="Model not found")

	path = [model_id_str]
	visited = set([model_id_str])
	current = model_id_str

	if target_id:
	target_str = str(target_id)
	if df.index.name == 'model_id':
	if target_str not in df.index:
	raise HTTPException(status_code=404, detail="Target model not found")

	while current != target_str and current not in visited:
	try:
	if df.index.name == 'model_id':
	row = df.loc[current]
	else:
	rows = df[df.get('model_id', '') == current]
	if len(rows) == 0:
	break
	row = rows.iloc[0]

	parent_id = row.get('parent_model')
	if parent_id and pd.notna(parent_id):
	parent_str = str(parent_id)
	if parent_str == target_str:
	path.append(parent_str)
	break
	if parent_str not in visited:
	path.append(parent_str)
	visited.add(parent_str)
	current = parent_str
	else:
	break
	else:
	break
	except (KeyError, IndexError):
	break
	else:
	while True:
	try:
	if df.index.name == 'model_id':
	row = df.loc[current]
	else:
	rows = df[df.get('model_id', '') == current]
	if len(rows) == 0:
	break
	row = rows.iloc[0]

	parent_id = row.get('parent_model')
	if parent_id and pd.notna(parent_id):
	parent_str = str(parent_id)
	if parent_str not in visited:
	path.append(parent_str)
	visited.add(parent_str)
	current = parent_str
	else:
	break
	else:
	break
	except (KeyError, IndexError):
	break

	return {
	"path": path,
	"source": model_id_str,
	"target": target_id if target_id else "root",
	"path_length": len(path) - 1
	}


	@app.get("/api/family/{model_id}")
	async def get_family_tree(
	model_id: str,
	max_depth: Optional[int] = Query(None, ge=1, le=100, description="Maximum depth to traverse. If None, traverses entire tree without limit."),
	max_depth_filter: Optional[int] = Query(None, ge=0, description="Filter results to models at or below this hierarchy depth.")
	):
	"""
	Get family tree for a model (ancestors and descendants).
	Returns the model, its parent chain, and all children.

	If max_depth is None, traverses the entire family tree without depth limits.
	"""
	if df is None:
	raise DataNotLoadedError()

	if reduced_embeddings is None:
	raise HTTPException(status_code=503, detail="Embeddings not ready")

	model_id_str = str(model_id)

	if df.index.name == 'model_id':
	if model_id_str not in df.index:
	raise HTTPException(status_code=404, detail="Model not found")
	model_lookup = df.loc
	else:
	model_rows = df[df.get('model_id', '') == model_id_str]
	if len(model_rows) == 0:
	raise HTTPException(status_code=404, detail="Model not found")
	model_lookup = lambda x: df[df.get('model_id', '') == x]

	from utils.network_analysis import _get_all_parents, _parse_parent_list

	children_index: Dict[str, List[str]] = {}
	parent_columns = ['parent_model', 'finetune_parent', 'quantized_parent', 'adapter_parent', 'merge_parent']

	for idx, row in df.iterrows():
	model_id_from_row = str(row.get('model_id', idx))
	all_parents = _get_all_parents(row)

	for rel_type, parent_list in all_parents.items():
	for parent_str in parent_list:
	if parent_str not in children_index:
	children_index[parent_str] = []
	children_index[parent_str].append(model_id_from_row)

	visited = set()

	def get_ancestors(current_id: str, depth: Optional[int]):
	if current_id in visited:
	return
	if depth is not None and depth <= 0:
	return
	visited.add(current_id)

	try:
	if df.index.name == 'model_id':
	row = df.loc[current_id]
	else:
	rows = model_lookup(current_id)
	if len(rows) == 0:
	return
	row = rows.iloc[0]

	all_parents = _get_all_parents(row)
	for rel_type, parent_list in all_parents.items():
	for parent_str in parent_list:
	if parent_str != 'nan' and parent_str != '':
	next_depth = depth - 1 if depth is not None else None
	get_ancestors(parent_str, next_depth)
	except (KeyError, IndexError):
	return

	def get_descendants(current_id: str, depth: Optional[int]):
	if current_id in visited:
	return
	if depth is not None and depth <= 0:
	return
	visited.add(current_id)

	children = children_index.get(current_id, [])
	for child_id in children:
	if child_id not in visited:
	next_depth = depth - 1 if depth is not None else None
	get_descendants(child_id, next_depth)

	get_ancestors(model_id_str, max_depth)
	visited = set()
	get_descendants(model_id_str, max_depth)
	visited.add(model_id_str)

	if df.index.name == 'model_id':
	try:
	family_df = df.loc[list(visited)]
	except KeyError:
	missing = [v for v in visited if v not in df.index]
	if missing:
	logger.warning(f"Some family members not found in index: {missing}")
	family_df = df.loc[[v for v in visited if v in df.index]]
	else:
	family_df = df[df.get('model_id', '').isin(visited)]

	if len(family_df) == 0:
	raise HTTPException(status_code=404, detail="Family tree data not available")

	family_indices = family_df.index.values
	if len(family_indices) > len(reduced_embeddings):
	raise HTTPException(status_code=503, detail="Embedding indices mismatch")

	family_reduced = reduced_embeddings[family_indices]

	family_map = {}
	for idx, (i, row) in enumerate(family_df.iterrows()):
	model_id_val = str(row.get('model_id', i))
	parent_id = row.get('parent_model')
	parent_id_str = str(parent_id) if parent_id and pd.notna(parent_id) else None

	depths = calculate_family_depths(df)
	model_depth = depths.get(model_id_val, 0)

	if max_depth_filter is not None and model_depth > max_depth_filter:
	continue

	family_map[model_id_val] = {
	"model_id": model_id_val,
	"x": float(family_reduced[idx, 0]),
	"y": float(family_reduced[idx, 1]),
	"z": float(family_reduced[idx, 2]) if family_reduced.shape[1] > 2 else 0.0,
	"library_name": str(row.get('library_name')) if pd.notna(row.get('library_name')) else None,
	"pipeline_tag": str(row.get('pipeline_tag')) if pd.notna(row.get('pipeline_tag')) else None,
	"downloads": int(row.get('downloads', 0)) if pd.notna(row.get('downloads')) else 0,
	"likes": int(row.get('likes', 0)) if pd.notna(row.get('likes')) else 0,
	"parent_model": parent_id_str,
	"licenses": str(row.get('licenses')) if pd.notna(row.get('licenses')) else None,
	"family_depth": model_depth,
	"children": []
	}

	root_models = []
	for model_id_val, model_data in family_map.items():
	parent_id = model_data["parent_model"]
	if parent_id and parent_id in family_map:
	family_map[parent_id]["children"].append(model_id_val)
	else:
	root_models.append(model_id_val)

	return {
	"root_model": model_id_str,
	"family": list(family_map.values()),
	"family_map": family_map,
	"root_models": root_models
	}


	@app.get("/api/search")
	async def search_models(
	q: str = Query(..., min_length=1, alias="query"),
	query: str = Query(None, min_length=1),
	limit: int = Query(20, ge=1, le=100),
	graph_aware: bool = Query(False),
	include_neighbors: bool = Query(True)
	):
	"""
	Search for models by name (for autocomplete and family tree lookup).
	Enhanced with graph-aware search option that includes network relationships.
	"""
	if df is None:
	raise DataNotLoadedError()

	# Support both 'q' and 'query' parameters
	search_query = query or q

	if graph_aware:
	try:
	network_builder = ModelNetworkBuilder(df)
	top_models = network_builder.get_top_models_by_field(n=1000)
	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')

	results = network_builder.search_graph_aware(
	query=search_query,
	graph=graph,
	max_results=limit,
	include_neighbors=include_neighbors
	)

	return {"results": results, "search_type": "graph_aware", "query": search_query}
	except (ValueError, KeyError, AttributeError) as e:
	logger.warning(f"Graph-aware search failed, falling back to basic search: {e}")

	query_lower = search_query.lower()

	# Enhanced search: search model_id, org, tags, library, pipeline
	model_id_col = df.get('model_id', '').astype(str).str.lower()
	library_col = df.get('library_name', '').astype(str).str.lower()
	pipeline_col = df.get('pipeline_tag', '').astype(str).str.lower()
	tags_col = df.get('tags', '').astype(str).str.lower()
	license_col = df.get('license', '').astype(str).str.lower()

	# Extract org from model_id
	org_col = model_id_col.str.split('/').str[0]

	# Multi-field search
	mask = (
	model_id_col.str.contains(query_lower, na=False) \|
	org_col.str.contains(query_lower, na=False) \|
	library_col.str.contains(query_lower, na=False) \|
	pipeline_col.str.contains(query_lower, na=False) \|
	tags_col.str.contains(query_lower, na=False) \|
	license_col.str.contains(query_lower, na=False)
	)

	matches = df[mask].head(limit)

	results = []
	for _, row in matches.iterrows():
	model_id = str(row.get('model_id', ''))
	org = model_id.split('/')[0] if '/' in model_id else ''

	# Get coordinates if available
	x = float(row.get('x', 0.0)) if 'x' in row else None
	y = float(row.get('y', 0.0)) if 'y' in row else None
	z = float(row.get('z', 0.0)) if 'z' in row else None

	results.append({
	"model_id": model_id,
	"x": x,
	"y": y,
	"z": z,
	"org": org,
	"library": row.get('library_name'),
	"pipeline": row.get('pipeline_tag'),
	"license": row.get('license') if pd.notna(row.get('license')) else None,
	"downloads": int(row.get('downloads', 0)),
	"likes": int(row.get('likes', 0)),
	"parent_model": row.get('parent_model') if pd.notna(row.get('parent_model')) else None,
	"match_type": "direct"
	})

	return {"results": results, "search_type": "basic", "query": search_query}


	@app.get("/api/search/fuzzy")
	async def fuzzy_search_models(
	q: str = Query(..., min_length=2, description="Search query"),
	limit: int = Query(50, ge=1, le=200, description="Maximum number of results"),
	threshold: int = Query(60, ge=0, le=100, description="Minimum fuzzy match score (0-100)"),
	):
	"""
	Fuzzy search for models using rapidfuzz.
	Handles typos and partial matches across model names, libraries, and pipelines.
	Returns results sorted by relevance score.
	"""
	if deps.df is None:
	raise DataNotLoadedError()

	df = deps.df

	try:
	from rapidfuzz import fuzz, process
	from rapidfuzz.utils import default_process

	query_lower = q.lower().strip()

	# Prepare choices - combine model_id, library, and pipeline for searching
	# Create a searchable string for each model
	model_ids = df['model_id'].astype(str).tolist()
	libraries = df.get('library_name', pd.Series([''] * len(df))).fillna('').astype(str).tolist()
	pipelines = df.get('pipeline_tag', pd.Series([''] * len(df))).fillna('').astype(str).tolist()

	# Create search strings - just model_id for better fuzzy matching
	# Library and pipeline are used for secondary filtering
	search_strings = [m.lower() for m in model_ids]

	# Use rapidfuzz to find best matches
	# WRatio is best for general fuzzy matching with typo tolerance
	# It handles transpositions, insertions, deletions well

	# extract returns list of (match, score, index)
	matches = process.extract(
	query_lower,
	search_strings,
	scorer=fuzz.WRatio,
	limit=limit * 3, # Get extra to filter by threshold and dedupe
	score_cutoff=threshold,
	processor=default_process
	)

	# Also try partial matching for substring searches
	if len(matches) < limit:
	partial_matches = process.extract(
	query_lower,
	search_strings,
	scorer=fuzz.partial_ratio,
	limit=limit * 2,
	score_cutoff=threshold + 10, # Higher threshold for partial
	processor=default_process
	)
	# Add unique partial matches
	seen_indices = {m[2] for m in matches}
	for m in partial_matches:
	if m[2] not in seen_indices:
	matches.append(m)
	seen_indices.add(m[2])

	results = []
	seen_ids = set()

	for match_str, score, idx in matches:
	if len(results) >= limit:
	break

	model_id = model_ids[idx]
	if model_id in seen_ids:
	continue
	seen_ids.add(model_id)

	row = df.iloc[idx]

	# Get coordinates
	x = float(row.get('x', 0.0)) if 'x' in row else None
	y = float(row.get('y', 0.0)) if 'y' in row else None
	z = float(row.get('z', 0.0)) if 'z' in row else None

	results.append({
	"model_id": model_id,
	"x": x,
	"y": y,
	"z": z,
	"score": round(score, 1),
	"library": row.get('library_name') if pd.notna(row.get('library_name')) else None,
	"pipeline": row.get('pipeline_tag') if pd.notna(row.get('pipeline_tag')) else None,
	"downloads": int(row.get('downloads', 0)),
	"likes": int(row.get('likes', 0)),
	"family_depth": int(row.get('family_depth', 0)) if pd.notna(row.get('family_depth')) else None,
	})

	# Sort by score descending, then by downloads for tie-breaking
	results.sort(key=lambda x: (-x['score'], -x['downloads']))

	return {
	"results": results,
	"query": q,
	"total_matches": len(matches),
	"threshold": threshold
	}

	except ImportError:
	raise HTTPException(status_code=500, detail="rapidfuzz not installed")
	except Exception as e:
	logger.exception(f"Fuzzy search error: {e}")
	raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")


	@app.get("/api/similar/{model_id}")
	async def get_similar_models(model_id: str, k: int = Query(10, ge=1, le=50)):
	"""
	Get k-nearest neighbors of a model based on embedding similarity.
	Returns similar models with distance scores.
	"""
	if deps.df is None or deps.embeddings is None:
	raise HTTPException(status_code=503, detail="Data not loaded")

	df = deps.df
	embeddings = deps.embeddings

	if 'model_id' in df.index.names or df.index.name == 'model_id':
	try:
	model_row = df.loc[[model_id]]
	model_idx = model_row.index[0]
	except KeyError:
	raise HTTPException(status_code=404, detail="Model not found")
	else:
	model_row = df[df.get('model_id', '') == model_id]
	if len(model_row) == 0:
	raise HTTPException(status_code=404, detail="Model not found")
	model_idx = model_row.index[0]
	model_embedding = embeddings[model_idx]

	from sklearn.metrics.pairwise import cosine_similarity
	model_embedding_2d = model_embedding.reshape(1, -1)
	similarities = cosine_similarity(model_embedding_2d, embeddings)[0]

	top_k_indices = np.argpartition(similarities, -k-1)[-k-1:-1]
	top_k_indices = top_k_indices[np.argsort(similarities[top_k_indices])][::-1]

	similar_models = []
	for idx in top_k_indices:
	if idx == model_idx:
	continue
	row = df.iloc[idx]
	similar_models.append({
	"model_id": row.get('model_id', 'Unknown'),
	"similarity": float(similarities[idx]),
	"distance": float(1 - similarities[idx]),
	"library_name": row.get('library_name'),
	"pipeline_tag": row.get('pipeline_tag'),
	"downloads": int(row.get('downloads', 0)),
	"likes": int(row.get('likes', 0)),
	})

	return {
	"query_model": model_id,
	"similar_models": similar_models
	}


	@app.get("/api/models/semantic-similarity")
	async def get_models_by_semantic_similarity(
	query_model_id: str = Query(...),
	k: int = Query(100, ge=1, le=1000),
	min_downloads: int = Query(0),
	min_likes: int = Query(0),
	projection_method: str = Query("umap")
	):
	"""
	Get models sorted by semantic similarity to a query model.
	Returns models with their similarity scores and coordinates.
	Useful for exploring the embedding space around a specific model.
	"""
	if deps.df is None or deps.embeddings is None:
	raise HTTPException(status_code=503, detail="Data not loaded")

	df = deps.df
	embeddings = deps.embeddings

	# Find the query model
	if 'model_id' in df.index.names or df.index.name == 'model_id':
	try:
	model_row = df.loc[[query_model_id]]
	model_idx = model_row.index[0]
	except KeyError:
	raise HTTPException(status_code=404, detail="Query model not found")
	else:
	model_row = df[df.get('model_id', '') == query_model_id]
	if len(model_row) == 0:
	raise HTTPException(status_code=404, detail="Query model not found")
	model_idx = model_row.index[0]

	query_embedding = embeddings[model_idx]

	filtered_df = data_loader.filter_data(
	df=df,
	min_downloads=min_downloads,
	min_likes=min_likes,
	search_query=None,
	libraries=None,
	pipeline_tags=None
	)

	if df.index.name == 'model_id' or 'model_id' in df.index.names:
	filtered_indices = [df.index.get_loc(idx) for idx in filtered_df.index]
	filtered_indices = np.array(filtered_indices, dtype=int)
	else:
	filtered_indices = filtered_df.index.values.astype(int)

	filtered_embeddings = embeddings[filtered_indices]
	from sklearn.metrics.pairwise import cosine_similarity
	query_embedding_2d = query_embedding.reshape(1, -1)
	similarities = cosine_similarity(query_embedding_2d, filtered_embeddings)[0]

	top_k_local_indices = np.argpartition(similarities, -k)[-k:]
	top_k_local_indices = top_k_local_indices[np.argsort(similarities[top_k_local_indices])][::-1]

	if reduced_embeddings is None:
	raise HTTPException(status_code=503, detail="Reduced embeddings not ready")

	top_k_original_indices = filtered_indices[top_k_local_indices]
	top_k_reduced = reduced_embeddings[top_k_original_indices]

	similar_models = []
	for i, orig_idx in enumerate(top_k_original_indices):
	row = df.iloc[orig_idx]
	local_idx = top_k_local_indices[i]
	similar_models.append({
	"model_id": str(row.get('model_id', 'Unknown')),
	"x": float(top_k_reduced[i, 0]),
	"y": float(top_k_reduced[i, 1]),
	"z": float(top_k_reduced[i, 2]) if top_k_reduced.shape[1] > 2 else 0.0,
	"similarity": float(similarities[local_idx]),
	"distance": float(1 - similarities[local_idx]),
	"library_name": str(row.get('library_name')) if pd.notna(row.get('library_name')) else None,
	"pipeline_tag": str(row.get('pipeline_tag')) if pd.notna(row.get('pipeline_tag')) else None,
	"downloads": int(row.get('downloads', 0)),
	"likes": int(row.get('likes', 0)),
	"trending_score": float(row.get('trendingScore')) if pd.notna(row.get('trendingScore')) else None,
	"tags": str(row.get('tags')) if pd.notna(row.get('tags')) else None,
	"parent_model": str(row.get('parent_model')) if pd.notna(row.get('parent_model')) else None,
	"licenses": str(row.get('licenses')) if pd.notna(row.get('licenses')) else None,
	})

	return {
	"query_model": query_model_id,
	"models": similar_models,
	"count": len(similar_models)
	}


	@app.get("/api/distance")
	async def get_distance(
	model_id_1: str = Query(...),
	model_id_2: str = Query(...)
	):
	"""
	Calculate distance/similarity between two models.
	"""
	if deps.df is None or deps.embeddings is None:
	raise HTTPException(status_code=503, detail="Data not loaded")

	df = deps.df
	embeddings = deps.embeddings

	# Find both models - optimized with index lookup
	if 'model_id' in df.index.names or df.index.name == 'model_id':
	try:
	model1_row = df.loc[[model_id_1]]
	model2_row = df.loc[[model_id_2]]
	idx1 = model1_row.index[0]
	idx2 = model2_row.index[0]
	except KeyError:
	raise HTTPException(status_code=404, detail="One or both models not found")
	else:
	model1_row = df[df.get('model_id', '') == model_id_1]
	model2_row = df[df.get('model_id', '') == model_id_2]
	if len(model1_row) == 0 or len(model2_row) == 0:
	raise HTTPException(status_code=404, detail="One or both models not found")
	idx1 = model1_row.index[0]
	idx2 = model2_row.index[0]

	from sklearn.metrics.pairwise import cosine_similarity
	similarity = cosine_similarity([embeddings[idx1]], [embeddings[idx2]])[0][0]
	distance = 1 - similarity

	return {
	"model_1": model_id_1,
	"model_2": model_id_2,
	"cosine_similarity": float(similarity),
	"cosine_distance": float(distance),
	"euclidean_distance": float(np.linalg.norm(embeddings[idx1] - embeddings[idx2]))
	}


	@app.post("/api/export")
	async def export_models(model_ids: List[str]):
	"""
	Export selected models as JSON with full metadata.
	"""
	if df is None:
	raise DataNotLoadedError()

	# Optimized export with index lookup
	if 'model_id' in df.index.names or df.index.name == 'model_id':
	try:
	exported = df.loc[model_ids]
	except KeyError:
	# Fallback if some IDs not in index
	exported = df[df.get('model_id', '').isin(model_ids)]
	else:
	exported = df[df.get('model_id', '').isin(model_ids)]

	if len(exported) == 0:
	return {"models": []}

	models = [
	{
	"model_id": str(row.get('model_id', '')),
	"library_name": str(row.get('library_name')) if pd.notna(row.get('library_name')) else None,
	"pipeline_tag": str(row.get('pipeline_tag')) if pd.notna(row.get('pipeline_tag')) else None,
	"downloads": int(row.get('downloads', 0)) if pd.notna(row.get('downloads')) else 0,
	"likes": int(row.get('likes', 0)) if pd.notna(row.get('likes')) else 0,
	"trending_score": float(row.get('trendingScore', 0)) if pd.notna(row.get('trendingScore')) else None,
	"tags": str(row.get('tags')) if pd.notna(row.get('tags')) else None,
	"licenses": str(row.get('licenses')) if pd.notna(row.get('licenses')) else None,
	"parent_model": str(row.get('parent_model')) if pd.notna(row.get('parent_model')) else None,
	}
	for _, row in exported.iterrows()
	]

	return {
	"count": len(models),
	"models": models
	}


	@app.get("/api/network/cooccurrence")
	async def get_cooccurrence_network(
	library: Optional[str] = Query(None),
	pipeline_tag: Optional[str] = Query(None),
	min_downloads: int = Query(0),
	min_likes: int = Query(0),
	n: int = Query(100, ge=1, le=1000),
	cooccurrence_method: str = Query("combined", regex="^(parent_family\|library\|pipeline\|tags\|combined)$")
	):
	"""
	Build co-occurrence network for models (inspired by Open Syllabus Project).
	Connects models that appear together in same contexts (parent family, library, pipeline, tags).

	Returns network graph data suitable for visualization.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)
	top_models = network_builder.get_top_models_by_field(
	library=library,
	pipeline_tag=pipeline_tag,
	min_downloads=min_downloads,
	min_likes=min_likes,
	n=n
	)

	if not top_models:
	return {
	"nodes": [],
	"links": [],
	"statistics": {}
	}

	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(
	model_ids=model_ids,
	cooccurrence_method=cooccurrence_method
	)

	nodes = []
	for node_id, attrs in graph.nodes(data=True):
	nodes.append({
	"id": node_id,
	"title": attrs.get('title', node_id),
	"author": attrs.get('author', ''),
	"freq": attrs.get('freq', 0),
	"likes": attrs.get('likes', 0),
	"library": attrs.get('library', ''),
	"pipeline": attrs.get('pipeline', '')
	})

	links = []
	for source, target, attrs in graph.edges(data=True):
	links.append({
	"source": source,
	"target": target,
	"weight": attrs.get('weight', 1)
	})

	stats = network_builder.get_network_statistics(graph)

	return {
	"nodes": nodes,
	"links": links,
	"statistics": stats
	}
	except (ValueError, KeyError, AttributeError) as e:
	logger.error(f"Error building network: {e}", exc_info=True)
	raise HTTPException(status_code=500, detail=f"Error building network: {str(e)}")


	@app.get("/api/network/family/{model_id}")
	async def get_family_network(
	model_id: str,
	max_depth: Optional[int] = Query(None, ge=1, le=100, description="Maximum depth to traverse. If None, traverses entire tree without limit."),
	edge_types: Optional[str] = Query(None, description="Comma-separated list of edge types to include (finetune,quantized,adapter,merge,parent). If None, includes all types."),
	include_edge_attributes: bool = Query(True, description="Whether to include edge attributes (change in likes, downloads, etc.)")
	):
	"""
	Build family tree network for a model (directed graph).
	Returns network graph data showing parent-child relationships with multiple relationship types.
	Supports filtering by edge type (finetune, quantized, adapter, merge, parent).
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	filter_types = None
	if edge_types:
	filter_types = [t.strip() for t in edge_types.split(',') if t.strip()]

	network_builder = ModelNetworkBuilder(df)
	graph = network_builder.build_family_tree_network(
	root_model_id=model_id,
	max_depth=max_depth,
	include_edge_attributes=include_edge_attributes,
	filter_edge_types=filter_types
	)

	nodes = []
	for node_id, attrs in graph.nodes(data=True):
	nodes.append({
	"id": node_id,
	"title": attrs.get('title', node_id),
	"freq": attrs.get('freq', 0),
	"likes": attrs.get('likes', 0),
	"downloads": attrs.get('downloads', 0),
	"library": attrs.get('library', ''),
	"pipeline": attrs.get('pipeline', '')
	})

	links = []
	for source, target, edge_attrs in graph.edges(data=True):
	link_data = {
	"source": source,
	"target": target,
	"edge_type": edge_attrs.get('edge_type'),
	"edge_types": edge_attrs.get('edge_types', [])
	}

	if include_edge_attributes:
	link_data.update({
	"change_in_likes": edge_attrs.get('change_in_likes'),
	"percentage_change_in_likes": edge_attrs.get('percentage_change_in_likes'),
	"change_in_downloads": edge_attrs.get('change_in_downloads'),
	"percentage_change_in_downloads": edge_attrs.get('percentage_change_in_downloads'),
	"change_in_createdAt_days": edge_attrs.get('change_in_createdAt_days')
	})

	links.append(link_data)

	stats = network_builder.get_network_statistics(graph)

	return {
	"nodes": nodes,
	"links": links,
	"statistics": stats,
	"root_model": model_id
	}
	except (ValueError, KeyError, AttributeError) as e:
	logger.error(f"Error building family network: {e}", exc_info=True)
	raise HTTPException(status_code=500, detail=f"Error building family network: {str(e)}")


	@app.get("/api/network/full-derivatives")
	@cached_response(ttl=3600, key_prefix="full_derivatives_network")
	async def get_full_derivative_network(
	edge_types: Optional[str] = Query(None, description="Comma-separated list of edge types to include (finetune,quantized,adapter,merge,parent). If None, includes all types."),
	include_edge_attributes: bool = Query(False, description="Whether to include edge attributes (change in likes, downloads, etc.). Default False for performance."),
	include_positions: bool = Query(True, description="Whether to include pre-computed 3D positions for each node. Default True for faster rendering."),
	min_downloads: int = Query(0, description="Minimum downloads to include a model. Use this to reduce network size."),
	max_nodes: Optional[int] = Query(None, ge=100, le=1000000, description="Maximum number of nodes to include. Models are sorted by downloads. Use this to reduce network size."),
	use_precomputed: bool = Query(True, description="Try to load pre-computed network graph from disk if available.")
	):
	"""
	Build full derivative relationship network for ALL models in the database.
	Returns a non-embedding based force-directed graph where edges represent derivative types.
	This computes over every single model in the database.

	Note: Edge attributes are disabled by default for performance with large datasets.
	If pre-computed positions exist, they will be included in the response.
	"""
	if deps.df is None or deps.df.empty:
	raise HTTPException(
	status_code=503,
	detail="Model data not loaded. Please wait for the server to finish loading data."
	)

	try:
	import time
	import networkx as nx
	start_time = time.time()

	# Check if dataframe has required columns
	required_columns = ['model_id']
	missing_columns = [col for col in required_columns if col not in deps.df.columns]
	if missing_columns:
	raise HTTPException(
	status_code=500,
	detail=f"Missing required columns: {missing_columns}"
	)

	# Try to load pre-computed network graph
	graph = None
	if use_precomputed:
	try:
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	precomputed_dir = os.path.join(root_dir, "precomputed_data")
	graph_file = os.path.join(precomputed_dir, "full_derivative_network.pkl")

	# Try to download from HF Hub if not found locally (for Spaces deployment)
	if not os.path.exists(graph_file):
	logger.info("Pre-computed network not found locally. Attempting to download from HF Hub...")
	from utils.precomputed_loader import download_network_from_hf_hub
	download_network_from_hf_hub(precomputed_dir, version="v1")

	if os.path.exists(graph_file):
	logger.info(f"Loading pre-computed network graph from {graph_file}...")
	with open(graph_file, 'rb') as f:
	graph = pickle.load(f)
	logger.info(f"Loaded pre-computed graph: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")
	else:
	logger.info("Pre-computed network graph not available. Will build from scratch.")
	except Exception as e:
	logger.warning(f"Could not load pre-computed network graph: {e}. Will build from scratch.")

	# Filter dataframe if needed
	filtered_df = deps.df.copy()
	if min_downloads > 0:
	filtered_df = filtered_df[filtered_df.get('downloads', 0) >= min_downloads]
	logger.info(f"Filtered to {len(filtered_df):,} models with >= {min_downloads} downloads")

	if max_nodes and len(filtered_df) > max_nodes:
	# Sort by downloads and take top N
	filtered_df = filtered_df.nlargest(max_nodes, 'downloads', keep='first')
	logger.info(f"Limited to top {max_nodes:,} models by downloads")

	logger.info(f"Building full derivative network for {len(filtered_df):,} models...")

	filter_types = None
	if edge_types:
	filter_types = [t.strip() for t in edge_types.split(',') if t.strip()]

	# Build graph if not loaded from disk
	if graph is None:
	try:
	network_builder = ModelNetworkBuilder(filtered_df)
	logger.info("Calling build_full_derivative_network...")

	# Disable edge attributes for very large graphs to improve performance
	# They can be slow to compute for 100k+ edges
	graph = network_builder.build_full_derivative_network(
	include_edge_attributes=include_edge_attributes,
	filter_edge_types=filter_types
	)
	except Exception as build_error:
	logger.error(f"Error in build_full_derivative_network: {build_error}", exc_info=True)
	raise HTTPException(
	status_code=500,
	detail=f"Failed to build network graph: {str(build_error)}"
	)
	else:
	# Filter pre-computed graph if needed
	if filter_types:
	# Remove edges that don't match filter
	edges_to_remove = []
	for source, target, attrs in graph.edges(data=True):
	edge_types_list = attrs.get('edge_types', [])
	if not isinstance(edge_types_list, list):
	edge_types_list = [edge_types_list] if edge_types_list else []
	if not any(et in filter_types for et in edge_types_list):
	edges_to_remove.append((source, target))
	graph.remove_edges_from(edges_to_remove)
	# Remove isolated nodes
	isolated = list(nx.isolates(graph))
	graph.remove_nodes_from(isolated)
	logger.info(f"Filtered graph: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")

	# Filter nodes by downloads if needed
	if min_downloads > 0 or max_nodes:
	nodes_to_remove = []
	for node_id in graph.nodes():
	if node_id in filtered_df.index:
	continue
	nodes_to_remove.append(node_id)
	graph.remove_nodes_from(nodes_to_remove)
	isolated = list(nx.isolates(graph))
	graph.remove_nodes_from(isolated)
	logger.info(f"Filtered graph by model selection: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")

	build_time = time.time() - start_time
	logger.info(f"Graph built in {build_time:.2f}s: {graph.number_of_nodes():,} nodes, {graph.number_of_edges():,} edges")

	# Load pre-computed positions if available
	precomputed_positions = {}
	if include_positions:
	try:
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	layout_file = os.path.join(root_dir, "precomputed_data", "force_layout_3d.pkl")

	if os.path.exists(layout_file):
	with open(layout_file, 'rb') as f:
	layout_data = pickle.load(f)
	precomputed_positions = layout_data.get('positions', {})
	logger.info(f"Loaded {len(precomputed_positions):,} pre-computed positions")
	except Exception as e:
	logger.warning(f"Could not load pre-computed positions: {e}")

	# Build nodes list with optional pre-computed positions
	nodes = []
	for node_id, attrs in graph.nodes(data=True):
	node_data = {
	"id": node_id,
	"title": attrs.get('title', node_id),
	"freq": attrs.get('freq', 0),
	"likes": attrs.get('likes', 0),
	"downloads": attrs.get('downloads', 0),
	"library": attrs.get('library', ''),
	"pipeline": attrs.get('pipeline', '')
	}

	# Add pre-computed position if available
	if node_id in precomputed_positions:
	pos = precomputed_positions[node_id]
	node_data['x'] = pos[0]
	node_data['y'] = pos[1]
	node_data['z'] = pos[2]

	nodes.append(node_data)

	logger.info(f"Processed {len(nodes):,} nodes")

	# Build links list
	links = []
	edge_count = 0
	for source, target, edge_attrs in graph.edges(data=True):
	link_data = {
	"source": source,
	"target": target,
	"edge_type": edge_attrs.get('edge_type'),
	"edge_types": edge_attrs.get('edge_types', [])
	}

	if include_edge_attributes:
	link_data.update({
	"change_in_likes": edge_attrs.get('change_in_likes'),
	"percentage_change_in_likes": edge_attrs.get('percentage_change_in_likes'),
	"change_in_downloads": edge_attrs.get('change_in_downloads'),
	"percentage_change_in_downloads": edge_attrs.get('percentage_change_in_downloads'),
	"change_in_createdAt_days": edge_attrs.get('change_in_createdAt_days')
	})

	links.append(link_data)
	edge_count += 1
	if edge_count % 10000 == 0:
	logger.info(f"Processed {edge_count:,} edges...")

	logger.info(f"Processed {len(links):,} links")

	try:
	stats = network_builder.get_network_statistics(graph)
	except Exception as stats_error:
	logger.warning(f"Could not calculate network statistics: {stats_error}")
	stats = {
	"nodes": len(nodes),
	"edges": len(links),
	"density": 0.0,
	"avg_degree": 0.0,
	"clustering": 0.0
	}

	total_time = time.time() - start_time
	logger.info(f"Full derivative network built successfully in {total_time:.2f}s")

	return {
	"nodes": nodes,
	"links": links,
	"statistics": stats
	}
	except HTTPException:
	# Re-raise HTTP exceptions as-is
	raise
	except DataNotLoadedError:
	raise HTTPException(
	status_code=503,
	detail="Model data not loaded. Please wait for the server to finish loading data."
	)
	except Exception as e:
	import traceback
	error_trace = traceback.format_exc()
	logger.error(f"Error building full derivative network: {e}\n{error_trace}")
	error_detail = f"Error building full derivative network: {str(e)}"
	if isinstance(e, (ValueError, KeyError, AttributeError)):
	error_detail += f" (Type: {type(e).__name__})"
	# Provide more helpful error message
	if "memory" in str(e).lower() or "MemoryError" in str(type(e)):
	error_detail += ". The dataset may be too large. Try filtering by edge types."
	raise HTTPException(status_code=500, detail=error_detail)


	@app.get("/api/search/neighbors/{model_id}")
	async def get_model_neighbors(
	model_id: str,
	max_neighbors: int = Query(50, ge=1, le=200),
	min_weight: float = Query(0.0, ge=0.0)
	):
	"""
	Find neighbors of a model in the co-occurrence network (graph-based search).
	Similar to graph database queries for finding connected nodes.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)
	top_models = network_builder.get_top_models_by_field(n=1000)
	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')

	neighbors = network_builder.find_neighbors(
	model_id=model_id,
	graph=graph,
	max_neighbors=max_neighbors,
	min_weight=min_weight
	)

	return {
	"model_id": model_id,
	"neighbors": neighbors,
	"count": len(neighbors)
	}
	except (ValueError, KeyError, AttributeError) as e:
	logger.error(f"Error finding neighbors: {e}", exc_info=True)
	raise HTTPException(status_code=500, detail=f"Error finding neighbors: {str(e)}")


	@app.get("/api/search/path")
	async def find_path_between_models(
	source_id: str = Query(...),
	target_id: str = Query(...),
	max_path_length: int = Query(5, ge=1, le=10)
	):
	"""
	Find shortest path between two models (graph-based search).
	Similar to graph database path queries.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)
	# Build network for top models (for performance)
	top_models = network_builder.get_top_models_by_field(n=1000)
	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')

	path = network_builder.find_path(
	source_id=source_id,
	target_id=target_id,
	graph=graph,
	max_path_length=max_path_length
	)

	if path is None:
	return {
	"source_id": source_id,
	"target_id": target_id,
	"path": None,
	"path_length": None,
	"found": False
	}

	return {
	"source_id": source_id,
	"target_id": target_id,
	"path": path,
	"path_length": len(path) - 1,
	"found": True
	}

	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error finding path: {str(e)}")


	@app.get("/api/search/cooccurrence/{model_id}")
	async def search_by_cooccurrence(
	model_id: str,
	max_results: int = Query(20, ge=1, le=100),
	min_weight: float = Query(1.0, ge=0.0)
	):
	"""
	Search for models that co-occur with a query model.
	Similar to graph database queries for co-assignment patterns.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)
	# Build network for top models (for performance)
	top_models = network_builder.get_top_models_by_field(n=1000)
	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(model_ids, cooccurrence_method='combined')

	results = network_builder.search_by_cooccurrence(
	query_model_id=model_id,
	graph=graph,
	max_results=max_results,
	min_weight=min_weight
	)

	return {
	"query_model": model_id,
	"cooccurring_models": results,
	"count": len(results)
	}

	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error searching by co-occurrence: {str(e)}")


	@app.get("/api/search/relationships/{model_id}")
	async def get_model_relationships(
	model_id: str,
	relationship_type: str = Query("all", regex="^(family\|library\|pipeline\|tags\|all)$"),
	max_results: int = Query(50, ge=1, le=200)
	):
	"""
	Find models by specific relationship types (family, library, pipeline, tags).
	Similar to graph database relationship queries.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)
	related_models = network_builder.find_models_by_relationship(
	model_id=model_id,
	relationship_type=relationship_type,
	max_results=max_results
	)

	return {
	"model_id": model_id,
	"relationship_type": relationship_type,
	"related_models": related_models,
	"count": len(related_models)
	}

	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error finding relationships: {str(e)}")


	@app.get("/api/model-count/current")
	async def get_current_model_count(
	use_cache: bool = Query(True),
	force_refresh: bool = Query(False),
	use_dataset_snapshot: bool = Query(False),
	use_models_page: bool = Query(True)
	):
	"""
	Get the current number of models on Hugging Face Hub.
	Uses multiple strategies: models page scraping (fastest), dataset snapshot, or API.

	Query Parameters:
	use_cache: Use cached results if available (default: True)
	force_refresh: Force refresh even if cache is valid (default: False)
	use_dataset_snapshot: Use dataset snapshot for breakdowns (default: False)
	use_models_page: Try to get count from HF models page first (default: True)
	"""
	try:
	tracker = get_tracker()

	if use_dataset_snapshot:
	count_data = tracker.get_count_from_models_page()
	if count_data is None:
	count_data = tracker.get_current_model_count(use_models_page=False)
	else:
	try:
	from utils.data_loader import ModelDataLoader
	data_loader = ModelDataLoader()
	df = data_loader.load_data(sample_size=10000, prioritize_base_models=True)
	library_counts = {}
	pipeline_counts = {}

	for _, row in df.iterrows():
	if pd.notna(row.get('library_name')):
	lib = str(row.get('library_name'))
	library_counts[lib] = library_counts.get(lib, 0) + 1
	if pd.notna(row.get('pipeline_tag')):
	pipeline = str(row.get('pipeline_tag'))
	pipeline_counts[pipeline] = pipeline_counts.get(pipeline, 0) + 1

	if len(df) > 0 and count_data["total_models"] > len(df):
	scale_factor = count_data["total_models"] / len(df)
	library_counts = {k: int(v * scale_factor) for k, v in library_counts.items()}
	pipeline_counts = {k: int(v * scale_factor) for k, v in pipeline_counts.items()}

	count_data["models_by_library"] = library_counts
	count_data["models_by_pipeline"] = pipeline_counts
	except Exception as e:
	logger.warning(f"Could not get breakdowns from dataset: {e}")
	else:
	count_data = tracker.get_current_model_count(use_models_page=use_models_page)

	return count_data
	except Exception as e:
	logger.error(f"Error fetching model count: {e}", exc_info=True)
	raise HTTPException(status_code=500, detail=f"Error fetching model count: {str(e)}")


	@app.get("/api/model-count/historical")
	async def get_historical_model_counts(
	days: int = Query(30, ge=1, le=365),
	start_date: Optional[str] = Query(None),
	end_date: Optional[str] = Query(None)
	):
	"""
	Get historical model counts.

	Args:
	days: Number of days to look back (if start_date not provided)
	start_date: Start date in ISO format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)
	end_date: End date in ISO format (YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS)
	"""
	try:
	from datetime import datetime

	tracker = get_tracker()

	start = None
	end = None

	if start_date:
	start = datetime.fromisoformat(start_date.replace('Z', '+00:00'))
	if end_date:
	end = datetime.fromisoformat(end_date.replace('Z', '+00:00'))

	if start is None:
	from datetime import timedelta
	start = datetime.utcnow() - timedelta(days=days)

	historical = tracker.get_historical_counts(start, end)

	return {
	"counts": historical,
	"count": len(historical),
	"start_date": start.isoformat() if start else None,
	"end_date": end.isoformat() if end else None
	}
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error fetching historical counts: {str(e)}")


	@app.get("/api/model-count/latest")
	async def get_latest_model_count():
	"""Get the most recently recorded model count from database."""
	try:
	tracker = get_tracker()
	latest = tracker.get_latest_count()
	if latest is None:
	raise HTTPException(status_code=404, detail="No model counts recorded yet")
	return latest
	except HTTPException:
	raise
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error fetching latest count: {str(e)}")


	@app.post("/api/model-count/record")
	async def record_model_count(
	background_tasks: BackgroundTasks,
	use_dataset_snapshot: bool = Query(False, description="Use dataset snapshot instead of API (faster)")
	):
	"""
	Record the current model count to the database.
	This can be called periodically (e.g., via cron job) to track growth over time.

	Query Parameters:
	use_dataset_snapshot: Use dataset snapshot instead of API (faster, default: False)
	"""
	try:
	tracker = get_tracker()

	def record():
	if use_dataset_snapshot:
	count_data = tracker.get_count_from_dataset_snapshot()
	if count_data:
	tracker.record_count(count_data, source="dataset_snapshot")
	else:
	count_data = tracker.get_current_model_count(use_cache=False)
	tracker.record_count(count_data, source="api")
	else:
	count_data = tracker.get_current_model_count(use_cache=False)
	tracker.record_count(count_data, source="api")

	background_tasks.add_task(record)

	return {
	"status": "recording",
	"message": "Model count recording started in background",
	"source": "dataset_snapshot" if use_dataset_snapshot else "api"
	}
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error recording model count: {str(e)}")


	@app.get("/api/model-count/growth")
	async def get_growth_stats(days: int = Query(7, ge=1, le=365)):
	"""
	Get growth statistics over the specified period.

	Args:
	days: Number of days to analyze
	"""
	try:
	tracker = get_tracker()
	stats = tracker.get_growth_stats(days)
	return stats
	except Exception as e:
	raise HTTPException(status_code=500, detail=f"Error calculating growth stats: {str(e)}")


	@app.get("/api/network/export/graphml")
	async def export_network_graphml(
	background_tasks: BackgroundTasks,
	library: Optional[str] = Query(None),
	pipeline_tag: Optional[str] = Query(None),
	min_downloads: int = Query(0),
	min_likes: int = Query(0),
	n: int = Query(100, ge=1, le=1000),
	cooccurrence_method: str = Query("combined", regex="^(parent_family\|library\|pipeline\|tags\|combined)$")
	):
	"""
	Export co-occurrence network as GraphML file (for import into Gephi, Cytoscape, etc.).
	Similar to Open Syllabus graph export functionality.
	"""
	if df is None:
	raise DataNotLoadedError()

	try:
	network_builder = ModelNetworkBuilder(df)

	top_models = network_builder.get_top_models_by_field(
	library=library,
	pipeline_tag=pipeline_tag,
	min_downloads=min_downloads,
	min_likes=min_likes,
	n=n
	)

	if not top_models:
	raise HTTPException(status_code=404, detail="No models found matching criteria")

	model_ids = [mid for mid, _ in top_models]
	graph = network_builder.build_cooccurrence_network(
	model_ids=model_ids,
	cooccurrence_method=cooccurrence_method
	)

	with tempfile.NamedTemporaryFile(mode='w', suffix='.graphml', delete=False) as tmp_file:
	tmp_path = tmp_file.name
	network_builder.export_graphml(graph, tmp_path)

	background_tasks.add_task(os.unlink, tmp_path)

	return FileResponse(
	tmp_path,
	media_type='application/xml',
	filename=f'network_{cooccurrence_method}_{n}_models.graphml'
	)
	except (ValueError, KeyError, AttributeError, IOError) as e:
	logger.error(f"Error exporting network: {e}", exc_info=True)
	raise HTTPException(status_code=500, detail=f"Error exporting network: {str(e)}")


	@app.get("/api/model/{model_id}/papers")
	async def get_model_papers(model_id: str):
	"""
	Get arXiv papers associated with a model.
	Extracts arXiv IDs from model tags and fetches paper information.
	"""
	if df is None:
	raise DataNotLoadedError()

	model = df[df.get('model_id', '') == model_id]
	if len(model) == 0:
	raise HTTPException(status_code=404, detail="Model not found")

	model = model.iloc[0]

	# Extract arXiv IDs from tags
	tags_str = str(model.get('tags', '')) if pd.notna(model.get('tags')) else ''
	arxiv_ids = extract_arxiv_ids(tags_str)

	if not arxiv_ids:
	return {
	"model_id": model_id,
	"arxiv_ids": [],
	"papers": []
	}

	# Fetch papers
	papers = await fetch_arxiv_papers(arxiv_ids[:10]) # Limit to 10 papers

	return {
	"model_id": model_id,
	"arxiv_ids": arxiv_ids,
	"papers": papers
	}


	@app.get("/api/models/minimal.bin")
	async def get_minimal_binary():
	"""
	Serve the binary minimal dataset file.
	This is optimized for fast client-side loading.
	"""
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	binary_path = os.path.join(root_dir, "cache", "binary", "embeddings.bin")

	if not os.path.exists(binary_path):
	raise HTTPException(status_code=404, detail="Binary dataset not found. Run export_binary.py first.")

	return FileResponse(
	binary_path,
	media_type="application/octet-stream",
	headers={
	"Content-Disposition": "attachment; filename=embeddings.bin",
	"Cache-Control": "public, max-age=3600"
	}
	)


	@app.get("/api/models/model_ids.json")
	async def get_model_ids_json():
	"""Serve the model IDs JSON file."""
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	json_path = os.path.join(root_dir, "cache", "binary", "model_ids.json")

	if not os.path.exists(json_path):
	raise HTTPException(status_code=404, detail="Model IDs file not found.")

	return FileResponse(
	json_path,
	media_type="application/json",
	headers={"Cache-Control": "public, max-age=3600"}
	)


	@app.get("/api/models/metadata.json")
	async def get_metadata_json():
	"""Serve the metadata JSON file with lookup tables."""
	backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	root_dir = os.path.dirname(backend_dir)
	json_path = os.path.join(root_dir, "cache", "binary", "metadata.json")

	if not os.path.exists(json_path):
	raise HTTPException(status_code=404, detail="Metadata file not found.")

	return FileResponse(
	json_path,
	media_type="application/json",
	headers={"Cache-Control": "public, max-age=3600"}
	)


	@app.get("/api/model/{model_id}/files")
	async def get_model_files(model_id: str, branch: str = Query("main")):
	"""
	Get file tree for a model from Hugging Face.
	Proxies the request to avoid CORS issues.
	Returns a flat list of files with path and size information.
	"""
	if not model_id or not model_id.strip():
	raise HTTPException(status_code=400, detail="Invalid model ID")

	branches_to_try = [branch, "main", "master"] if branch not in ["main", "master"] else [branch, "main" if branch == "master" else "master"]

	try:
	async with httpx.AsyncClient(timeout=15.0) as client:
	for branch_name in branches_to_try:
	try:
	url = f"https://huggingface.co/api/models/{model_id}/tree/{branch_name}"
	response = await client.get(url)

	if response.status_code == 200:
	data = response.json()
	# Ensure we return an array
	if isinstance(data, list):
	return data
	elif isinstance(data, dict) and 'tree' in data:
	return data['tree']
	else:
	return []

	elif response.status_code == 404:
	# Try next branch
	continue
	else:
	logger.warning(f"Unexpected status {response.status_code} for {url}")
	continue

	except httpx.HTTPStatusError as e:
	if e.response.status_code == 404:
	continue # Try next branch
	logger.warning(f"HTTP error for branch {branch_name}: {e}")
	continue
	except httpx.HTTPError as e:
	logger.warning(f"HTTP error for branch {branch_name}: {e}")
	continue

	# All branches failed
	raise HTTPException(
	status_code=404,
	detail=f"File tree not found for model '{model_id}'. The model may not exist or may not have any files."
	)

	except httpx.TimeoutException:
	raise HTTPException(
	status_code=504,
	detail="Request to Hugging Face timed out. Please try again later."
	)
	except HTTPException:
	raise # Re-raise HTTP exceptions
	except Exception as e:
	logger.error(f"Error fetching file tree: {e}", exc_info=True)
	raise HTTPException(
	status_code=500,
	detail=f"Error fetching file tree: {str(e)}"
	)


	# =============================================================================
	# BACKGROUND COMPUTATION ENDPOINTS
	# =============================================================================

	import subprocess
	import threading

	# Store for background process
	_background_process = None
	_background_lock = threading.Lock()


	class ComputeRequest(BaseModel):
	sample_size: Optional[int] = None
	all_models: bool = False


	@app.get("/api/compute/status")
	async def get_compute_status():
	"""Get the status of background pre-computation."""
	from pathlib import Path

	root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	status_file = Path(root_dir) / "precomputed_data" / "background_status_v1.json"

	if status_file.exists():
	import json
	with open(status_file, 'r') as f:
	status = json.load(f)

	# Check if process is still running
	global _background_process
	with _background_lock:
	if _background_process is not None:
	poll = _background_process.poll()
	if poll is None:
	status['process_running'] = True
	else:
	status['process_running'] = False
	status['process_exit_code'] = poll
	else:
	status['process_running'] = False

	return status

	# Check for existing precomputed data
	metadata_file = Path(root_dir) / "precomputed_data" / "metadata_v1.json"
	models_file = Path(root_dir) / "precomputed_data" / "models_v1.parquet"

	if metadata_file.exists() and models_file.exists():
	import json
	with open(metadata_file, 'r') as f:
	metadata = json.load(f)
	return {
	'status': 'completed',
	'total_models': metadata.get('total_models', 0),
	'created_at': metadata.get('created_at'),
	'process_running': False
	}

	return {
	'status': 'not_started',
	'total_models': 0,
	'process_running': False
	}


	@app.post("/api/compute/start")
	async def start_background_compute(request: ComputeRequest, background_tasks: BackgroundTasks):
	"""Start background pre-computation of model embeddings."""
	global _background_process

	with _background_lock:
	if _background_process is not None and _background_process.poll() is None:
	raise HTTPException(
	status_code=409,
	detail="Background computation is already running"
	)

	# Prepare command
	root_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	script_path = os.path.join(root_dir, "backend", "scripts", "precompute_background.py")
	venv_python = os.path.join(root_dir, "venv", "bin", "python")

	cmd = [venv_python, script_path]

	if request.all_models:
	cmd.append("--all")
	elif request.sample_size:
	cmd.extend(["--sample-size", str(request.sample_size)])
	else:
	cmd.extend(["--sample-size", "150000"]) # Default

	cmd.extend(["--output-dir", os.path.join(root_dir, "precomputed_data")])

	# Start process in background
	log_file = os.path.join(root_dir, "precompute_background.log")

	def run_computation():
	global _background_process
	with open(log_file, 'w') as f:
	with _background_lock:
	_background_process = subprocess.Popen(
	cmd,
	stdout=f,
	stderr=subprocess.STDOUT,
	cwd=os.path.join(root_dir, "backend")
	)
	_background_process.wait()

	thread = threading.Thread(target=run_computation, daemon=True)
	thread.start()

	sample_desc = "all models" if request.all_models else f"{request.sample_size or 150000:,} models"

	return {
	"message": f"Background computation started for {sample_desc}",
	"status": "starting",
	"log_file": log_file
	}


	@app.post("/api/compute/stop")
	async def stop_background_compute():
	"""Stop the running background computation."""
	global _background_process

	with _background_lock:
	if _background_process is None or _background_process.poll() is not None:
	return {"message": "No computation is running"}

	_background_process.terminate()
	try:
	_background_process.wait(timeout=5)
	except subprocess.TimeoutExpired:
	_background_process.kill()

	return {"message": "Background computation stopped"}


	@app.get("/api/data/info")
	async def get_data_info():
	"""Get information about currently loaded data."""
	df = deps.df

	if df is None:
	return {
	"loaded": False,
	"message": "No data loaded"
	}

	return {
	"loaded": True,
	"total_models": len(df),
	"columns": list(df.columns),
	"unique_libraries": int(df['library_name'].nunique()) if 'library_name' in df.columns else 0,
	"unique_pipelines": int(df['pipeline_tag'].nunique()) if 'pipeline_tag' in df.columns else 0,
	"has_3d_coords": all(col in df.columns for col in ['x_3d', 'y_3d', 'z_3d']),
	"has_2d_coords": all(col in df.columns for col in ['x_2d', 'y_2d'])
	}


	# =============================================================================
	# STATIC FILE SERVING (for HF Spaces full-stack deployment)
	# =============================================================================

	from fastapi.staticfiles import StaticFiles
	from starlette.responses import FileResponse as StarletteFileResponse

	# Check if frontend build exists (for HF Spaces deployment)
	frontend_build_path = os.path.join(os.path.dirname(backend_dir), "frontend", "build")
	if os.path.exists(frontend_build_path):
	# Serve static files from React build
	app.mount("/static", StaticFiles(directory=os.path.join(frontend_build_path, "static")), name="static")

	@app.get("/{full_path:path}")
	async def serve_frontend(full_path: str):
	"""Serve React frontend for non-API routes."""
	# Don't serve frontend for API routes
	if full_path.startswith("api/") or full_path == "docs" or full_path == "openapi.json":
	raise HTTPException(status_code=404, detail="Not found")

	# Try to serve the requested file
	file_path = os.path.join(frontend_build_path, full_path)
	if os.path.isfile(file_path):
	return StarletteFileResponse(file_path)

	# Fall back to index.html for SPA routing
	index_path = os.path.join(frontend_build_path, "index.html")
	if os.path.exists(index_path):
	return StarletteFileResponse(index_path)

	raise HTTPException(status_code=404, detail="Not found")


	if __name__ == "__main__":
	import uvicorn
	port = int(os.getenv("PORT", 8000))
	uvicorn.run(app, host="0.0.0.0", port=port)