davanstrien's picture
davanstrien HF Staff
db: prefer bucket-mounted /data/lineage.db with baked-in fallback
c084cd8 verified
"""Dataset Lineage Explorer — FastAPI + SQLite (read-only) + vis-network.
- Overview: analysis dashboard (relationship-type mix, most-copied, most-derived,
most-translated, most-prolific producers) over the inferred lineage graph.
- Explorer: search/pick a dataset -> interactive lineage graph + related lists.
- API: GET /related?dataset=<id>, /api/graph, /api/map, /analysis, /examples, /stats.
DB is opened read-only; the bucket-mounted /data/lineage.db is preferred, with
the lineage.db baked into the repo as a fallback (small, robust).
Edward Tufte-inspired styling.
"""
import json
import sqlite3
from functools import lru_cache
from pathlib import Path
from fastapi import FastAPI, Query, Request
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import HTMLResponse, JSONResponse
# Prefer the bucket-mounted DB (/data/lineage.db, persistent + can grow);
# fall back to the baked-in copy if the mount isn't present (cold-mount fail,
# local dev, etc.) so the Space stays up either way.
BUCKET_DB = Path("/data/lineage.db")
LOCAL_DB = Path(__file__).parent / "lineage.db"
# The choice is made once, at import time; it is not re-evaluated per request.
DB = BUCKET_DB if BUCKET_DB.exists() else LOCAL_DB
# Startup log line recording which database file was selected.
print(f"using DB: {DB} (bucket-mount={BUCKET_DB.exists()})", flush=True)
# High safety ceiling only — stops a depth-2 hub exploding to tens of thousands
# of nodes. Normal hubs (incl. FLAN's ~1.4k children) render fully.
MAX_NODES = 2500
app = FastAPI(title="Dataset Lineage Explorer")
# Compress response bodies of at least 500 bytes.
app.add_middleware(GZipMiddleware, minimum_size=500)
# One module-wide connection, opened through a SQLite URI with mode=ro (read-only).
# check_same_thread=False lifts sqlite3's same-thread check so the connection
# can be used from threads other than the one that created it.
_con = sqlite3.connect(f"file:{DB}?mode=ro", uri=True, check_same_thread=False)
# Rows come back as sqlite3.Row so they can be read by column name and passed to dict().
_con.row_factory = sqlite3.Row
@app.middleware("http")
async def cache_headers(request: Request, call_next):
    """Mark successful data-endpoint responses as cacheable for one day.

    Applies to paths starting with /api, /related, /examples, /stats or
    /analysis. Only 200 responses are tagged: an error reply (validation
    failure, transient server error) must not be pinned in browser or proxy
    caches for 24 hours.

    Args:
        request: the incoming request; only its URL path is inspected.
        call_next: the next handler in the middleware chain.

    Returns:
        The downstream response, with ``Cache-Control`` added when eligible.
    """
    resp = await call_next(request)
    is_data_path = request.url.path.startswith(
        ("/api", "/related", "/examples", "/stats", "/analysis")
    )
    if is_data_path and resp.status_code == 200:
        resp.headers["Cache-Control"] = "public, max-age=86400"
    return resp
# Bounded on purpose: the key is a dataset id taken from the query string, so an
# unbounded cache would grow by one entry for every distinct id a client sends.
@lru_cache(maxsize=16384)
def _meta(dataset: str) -> dict:
    """Return the ``nodes`` row for *dataset* as a dict.

    When the dataset has no row, a stub with zero downloads/likes and an empty
    author is returned instead, so callers can always read those keys.

    The returned dict is the cached object itself; callers must treat it as
    read-only.
    """
    row = _con.execute("SELECT * FROM nodes WHERE dataset_id=?", (dataset,)).fetchone()
    if row is None:
        return {"dataset_id": dataset, "downloads": 0, "likes": 0, "author": ""}
    return dict(row)
# Bounded on purpose: the key is a dataset id taken from the query string, so an
# unbounded cache would grow by one entry for every distinct id a client sends.
@lru_cache(maxsize=16384)
def parents(dataset: str) -> list[dict]:
    """Return the edges pointing at *dataset*, i.e. the datasets it derives from.

    Each item carries ``id`` (the parent), ``primary_type``, ``confidence``,
    ``tags``, ``size_ratio`` and ``source``; items are ordered by descending
    confidence. An unknown dataset yields an empty list.

    The returned list is the cached object itself; callers must not mutate it.
    """
    rows = _con.execute(
        "SELECT parent AS id, primary_type, confidence, tags, size_ratio, source "
        "FROM edges WHERE child=? ORDER BY confidence DESC",
        (dataset,),
    ).fetchall()
    return [dict(row) for row in rows]
# Bounded on purpose: the key is a dataset id taken from the query string, so an
# unbounded cache would grow by one entry for every distinct id a client sends.
@lru_cache(maxsize=16384)
def children(dataset: str) -> list[dict]:
    """Return the edges leaving *dataset*, i.e. the datasets derived from it.

    Each item carries ``id`` (the child), ``primary_type``, ``confidence``,
    ``tags``, ``size_ratio`` and ``source``; items are ordered by descending
    confidence. An unknown dataset yields an empty list.

    The returned list is the cached object itself; callers must not mutate it.
    """
    rows = _con.execute(
        "SELECT child AS id, primary_type, confidence, tags, size_ratio, source "
        "FROM edges WHERE parent=? ORDER BY confidence DESC",
        (dataset,),
    ).fetchall()
    return [dict(row) for row in rows]
def siblings(dataset: str) -> list[dict]:
    """Return datasets that share at least one parent with *dataset*.

    Each item has ``id`` (the sibling), ``via`` (the shared parent) and
    ``primary_type`` (of the sibling's edge to that parent). The query is
    capped at 80 distinct rows and excludes *dataset* itself.
    """
    sql = (
        "SELECT DISTINCT e2.child AS id, e2.parent AS via, e2.primary_type "
        "FROM edges e1 JOIN edges e2 ON e1.parent=e2.parent "
        "WHERE e1.child=? AND e2.child!=? LIMIT 80"
    )
    cursor = _con.execute(sql, (dataset, dataset))
    return [dict(row) for row in cursor]
def related(dataset: str) -> dict:
    """Bundle the metadata, parents, children and siblings of *dataset* into one dict."""
    payload = {"dataset": dataset}
    payload["meta"] = _meta(dataset)
    payload["parents"] = parents(dataset)
    payload["children"] = children(dataset)
    payload["siblings"] = siblings(dataset)
    return payload
@app.get("/related")
def related_endpoint(dataset: str = Query(..., description="dataset id, e.g. tatsu-lab/alpaca")):
    """Return the parents, children and siblings of *dataset* as JSON (for metadata workflows)."""
    payload = related(dataset)
    return JSONResponse(content=payload)
def _expand(start: str, levels: int, nodes: set, edges: dict, *, upward: bool) -> None:
    """Walk breadth-first from *start* in one direction, filling *nodes* and *edges* in place.

    Args:
        start: dataset id the walk begins at.
        levels: maximum number of hops to follow.
        nodes: set of dataset ids collected so far; new ids are added until
            ``MAX_NODES`` is reached.
        edges: maps ``(parent, child)`` to ``(primary_type, confidence, source)``;
            a pair seen again overwrites the stored attributes.
        upward: follow ``parents()`` when true, ``children()`` otherwise.
    """
    neighbours = parents if upward else children
    frontier = {start}
    for _ in range(levels):
        next_frontier = set()
        for current in frontier:
            for nb in neighbours(current):
                key = (nb["id"], current) if upward else (current, nb["id"])
                edges[key] = (nb["primary_type"], nb["confidence"], nb["source"])
                # Edges are always recorded; nodes stop being added at the cap.
                # Edges left pointing at an uncollected node are dropped by graph().
                if nb["id"] not in nodes and len(nodes) < MAX_NODES:
                    nodes.add(nb["id"])
                    next_frontier.add(nb["id"])
        frontier = next_frontier
        if len(nodes) >= MAX_NODES:
            break


@app.get("/api/graph")
def graph(dataset: str, depth: int = 2):
    """Return the lineage neighbourhood of *dataset* as vis-network nodes and edges.

    Ancestors and descendants are each followed for ``depth`` hops (clamped to
    1..4), with the total node count capped at ``MAX_NODES``.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Each node has ``id``, ``label``
        (the part after the last "/"), ``title``, ``downloads`` and ``focus``
        (true only for *dataset*). Each edge has ``from``, ``to``, ``type``,
        ``confidence`` and ``source``.
    """
    levels = max(1, min(depth, 4))
    nodes = {dataset}
    edges = {}
    _expand(dataset, levels, nodes, edges, upward=True)
    _expand(dataset, levels, nodes, edges, upward=False)

    node_list = []
    for n in nodes:
        # One metadata lookup per node; a missing/NULL download count is shown
        # as 0 in the tooltip instead of breaking the thousands-separator format.
        downloads = _meta(n).get("downloads")
        node_list.append({
            "id": n,
            "label": n.split("/")[-1],
            "title": f"{n} ({downloads or 0:,} dl)",
            "downloads": downloads,
            "focus": n == dataset,
        })
    edge_list = [
        {"from": a, "to": b, "type": t, "confidence": cf, "source": s}
        for (a, b), (t, cf, s) in edges.items()
        if a in nodes and b in nodes
    ]
    return {"nodes": node_list, "edges": edge_list}
@lru_cache(maxsize=1)
def _stats():
    """Headline counts for the page header: nodes, edges, inferred edges, derivative datasets.

    Computed once per process and cached.
    """
    def scalar(sql):
        # First column of the first row of a single-value query.
        return _con.execute(sql).fetchone()[0]

    return {
        "nodes": scalar("SELECT count(*) FROM nodes"),
        "edges": scalar("SELECT count(*) FROM edges"),
        "inferred": scalar("SELECT count(*) FROM edges WHERE source='inferred'"),
        "derivatives": scalar("SELECT count(DISTINCT child) FROM edges WHERE source='inferred'"),
    }
@lru_cache(maxsize=8)
def _examples(n: int):
    """Return the *n* parents with the most inferred derivatives, largest first.

    Each item is ``{"id": <parent>, "derivatives": <count>}``. Results for up
    to eight distinct values of *n* are cached.
    """
    sql = (
        "SELECT parent AS id, count(*) AS derivatives FROM edges "
        "WHERE source='inferred' GROUP BY parent ORDER BY derivatives DESC LIMIT ?"
    )
    return [dict(row) for row in _con.execute(sql, (n,))]
@lru_cache(maxsize=1)
def _analysis():
    """Run the fixed set of aggregate queries behind the Overview tab.

    Returns a dict with one list of row-dicts per figure: ``type_dist``
    (rows ``{t, n}``) and ``most_copied``, ``most_derived``,
    ``most_translated``, ``top_orgs`` (rows ``{id, n}``). Computed once per
    process and cached.
    """
    queries = {
        "type_dist": "SELECT primary_type t, count(*) n FROM edges WHERE source='inferred' "
                     "GROUP BY t ORDER BY n DESC",
        "most_copied": "SELECT parent id, count(*) n FROM edges WHERE primary_type='exact_copy' "
                       "GROUP BY parent ORDER BY n DESC LIMIT 12",
        "most_derived": "SELECT parent id, count(*) n FROM edges WHERE source='inferred' "
                        "GROUP BY parent ORDER BY n DESC LIMIT 12",
        "most_translated": "SELECT parent id, count(*) n FROM edges WHERE primary_type='translation' "
                           "GROUP BY parent ORDER BY n DESC LIMIT 10",
        # Org/user = the part of the child id before the first "/".
        "top_orgs": "SELECT substr(child,1,instr(child,'/')-1) id, count(*) n FROM edges "
                    "WHERE source='inferred' AND instr(child,'/')>0 GROUP BY id ORDER BY n DESC LIMIT 12",
    }
    return {
        figure: [dict(row) for row in _con.execute(sql).fetchall()]
        for figure, sql in queries.items()
    }
@lru_cache(maxsize=1)
def _map():
    """Build the overview map: multi-level chains plus the top fan-out hubs.

    Chain edges are inferred edges whose child is itself the parent of another
    inferred edge. The 60 parents with the most inferred derivatives are added
    as nodes even when they take part in no chain. Every node reports its
    derivative count (used for sizing) and its download count.
    """
    fanout = dict(_con.execute(
        "SELECT parent, count(*) FROM edges WHERE source='inferred' GROUP BY parent"
    ).fetchall())
    chain_rows = _con.execute(
        "SELECT parent, child, primary_type, confidence FROM edges WHERE source='inferred' "
        "AND child IN (SELECT DISTINCT parent FROM edges WHERE source='inferred')"
    ).fetchall()

    members = set()
    for row in chain_rows:
        members.add(row["parent"])
        members.add(row["child"])
    edges = [
        {"from": row["parent"], "to": row["child"],
         "type": row["primary_type"], "confidence": row["confidence"]}
        for row in chain_rows
    ]

    # Stable sort, largest derivative count first; keep the top 60 hubs.
    hubs = sorted(fanout.items(), key=lambda item: item[1], reverse=True)[:60]
    members.update(pid for pid, _count in hubs)

    nodes = [
        {"id": n, "label": n.split("/")[-1], "derivatives": fanout.get(n, 0),
         "downloads": _meta(n)["downloads"]}
        for n in members
    ]
    return {"nodes": nodes, "edges": edges}
@app.get("/api/map")
def api_map():
    """Serve the overview map (nodes and edges) produced by ``_map()``."""
    return _map()
@app.get("/stats")
def stats():
    """Serve the headline counts produced by ``_stats()``."""
    return _stats()
@app.get("/examples")
def examples(n: int = 24):
    """Serve the *n* most-derived datasets produced by ``_examples(n)`` (default 24).

    NOTE(review): *n* is passed straight through as the SQL ``LIMIT`` with no
    range check here — confirm whether very large or negative values should be
    rejected.
    """
    return _examples(n)
@app.get("/analysis")
def analysis():
    """Aggregate analysis of the inferred lineage graph.

    Serves the dict produced by ``_analysis()`` unchanged.
    """
    return _analysis()
def _json_for_script(value) -> str:
    """Serialise *value* as JSON that is safe to inline inside a ``<script>`` element.

    ``json.dumps`` leaves ``<``, ``>`` and ``&`` untouched, so a string holding
    ``</script>`` would terminate the script element early. These characters
    can only occur inside JSON strings, where the ``\\uXXXX`` form decodes to
    the same character, so the parsed value is unchanged.
    """
    return (
        json.dumps(value)
        .replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
    )


@app.get("/", response_class=HTMLResponse)
def index():
    """Serve the single-page UI with stats, examples and analysis data inlined.

    The three placeholders in ``INDEX_HTML`` are replaced with script-safe
    JSON so the page renders its Overview without extra requests.
    """
    return (
        INDEX_HTML
        .replace("__STATS__", _json_for_script(_stats()))
        .replace("__EXAMPLES__", _json_for_script(_examples(24)))
        .replace("__ANALYSIS__", _json_for_script(_analysis()))
    )
INDEX_HTML = """<!doctype html><html><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Dataset Lineage Explorer</title>
<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
<style>
:root{--paper:#fffff8;--ink:#111;--muted:#6f6f66;--rule:#dcdccf;--accent:#c1440e;--bar:#43586b}
*{box-sizing:border-box}
body{margin:0;background:var(--paper);color:var(--ink);
font-family:Palatino,"Palatino Linotype","Book Antiqua",Georgia,serif;font-size:16px;line-height:1.5}
header{padding:22px 34px 14px;border-bottom:1px solid var(--rule)}
h1{font-weight:400;font-size:23px;margin:0;display:inline}
.sub{font-style:italic;color:var(--muted);font-size:14px;margin:3px 0 12px}
nav{display:flex;gap:18px;align-items:flex-end;flex-wrap:wrap}
nav .tab{font-size:15px;cursor:pointer;border-bottom:1px solid transparent;padding-bottom:2px}
nav .tab.on{border-color:var(--ink)}
.controls{display:flex;gap:18px;align-items:flex-end;margin-left:auto;flex-wrap:wrap}
label{display:block;font-size:11px;color:var(--muted);letter-spacing:.05em;margin-bottom:2px}
input,select{font-family:inherit;font-size:15px;background:var(--paper);border:0;
border-bottom:1px solid var(--ink);color:var(--ink);padding:3px 2px}
input{width:250px}
button{font-family:inherit;font-size:15px;background:none;border:0;color:var(--accent);
border-bottom:1px solid var(--accent);cursor:pointer;padding:0 0 2px}
#overview{padding:26px 40px 60px;overflow:auto;height:calc(100vh - 132px)}
.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(360px,1fr));gap:34px 50px;max-width:1180px}
.fig h3{font-weight:400;font-size:13px;letter-spacing:.09em;text-transform:uppercase;color:var(--muted);
margin:0 0 3px;border-bottom:1px solid var(--rule);padding-bottom:3px}
.cap{font-style:italic;color:var(--muted);font-size:12.5px;margin:0 0 9px}
table.bars{width:100%;border-collapse:collapse;font-size:14px}
table.bars td{padding:2.5px 6px;vertical-align:middle}
td.lbl{white-space:nowrap;max-width:230px;overflow:hidden;text-overflow:ellipsis}
td.bar{width:46%}
td.bar span{display:block;height:11px;background:var(--bar);opacity:.85}
td.num{text-align:right;color:var(--muted);font-variant-numeric:tabular-nums;width:48px}
#wrap{display:flex;height:calc(100vh - 132px);position:relative}
#net{flex:2.3;min-width:0}
#legend{position:absolute;left:14px;bottom:14px;background:rgba(255,255,248,.94);
border:1px solid var(--rule);padding:8px 11px;font-size:11px;line-height:1.75;max-width:210px;
box-shadow:0 1px 3px rgba(0,0,0,.06);user-select:none}
#legend .lt{white-space:nowrap;cursor:pointer;padding:1px 4px;margin:0 -4px;border-radius:3px}
#legend .lt:hover{background:#f1f1e6}
#legend .lt.off{opacity:.35;text-decoration:line-through}
#legend .reset{margin-top:5px;padding-top:4px;border-top:1px solid var(--rule);
font-style:italic;color:var(--muted);cursor:pointer}
#legend .reset:hover{color:var(--ink)}
@media(max-width:760px){#legend{display:none}}
#side{flex:1;max-width:440px;overflow:auto;padding:22px 28px;border-left:1px solid var(--rule)}
#side h2{font-weight:400;font-size:19px;margin:0;word-break:break-all}
.meta{color:var(--muted);font-style:italic;font-size:13px;margin:4px 0}
#side h3{font-weight:400;font-size:12px;letter-spacing:.1em;text-transform:uppercase;color:var(--muted);
margin:22px 0 7px;border-bottom:1px solid var(--rule);padding-bottom:3px}
.row{padding:4px 0;font-size:15px}
.dot{display:inline-block;width:8px;height:8px;border-radius:50%;margin-right:8px;vertical-align:middle}
.ty{font-style:italic;color:var(--muted);font-size:12.5px;margin-left:7px}
a{color:var(--ink);text-decoration:none;border-bottom:1px solid var(--rule);cursor:pointer}
a:hover{border-color:var(--ink)}
.meta a{border:0;color:var(--accent)}
a.hub{border:0;color:var(--accent);font-size:11px;margin-left:4px;padding:0 3px;opacity:.7}
a.hub:hover{opacity:1}
code{font-family:ui-monospace,monospace;font-size:12.5px;background:#f1f1e6;padding:1px 5px}
.placeholder{color:var(--muted);font-style:italic}
</style></head><body>
<header>
<h1>Dataset Lineage Explorer</h1>
<div class="sub" id="sub"></div>
<nav>
<span class="tab on" id="tabOv" onclick="showOverview()">Overview</span>
<span class="tab" id="tabEx" onclick="showExplorer()">Explorer</span>
<span class="controls">
<span><label>dataset</label><input id="q" value="tatsu-lab/alpaca" spellcheck="false"
onkeydown="if(event.key==='Enter')explore()"></span>
<span><label>or a much-derived dataset</label>
<select id="ex"><option value="">— examples —</option></select></span>
<button onclick="explore()">explore →</button>
</span>
</nav>
</header>
<div id="overview"></div>
<div id="wrap" style="display:none"><div id="net"></div><div id="legend"></div><div id="side"></div></div>
<script>
// Server-injected data: the three placeholders on the next line are replaced with JSON
// by the Python index() handler before the page is sent.
const STATS=__STATS__, EXAMPLES=__EXAMPLES__, ANALYSIS=__ANALYSIS__;
// Header subtitle: headline counts taken from STATS.
document.getElementById('sub').textContent =
STATS.nodes.toLocaleString()+' datasets · '+STATS.inferred.toLocaleString()+
' inferred lineage edges · '+STATS.derivatives.toLocaleString()+' datasets are derivatives — all recovered from content';
// Fill the examples dropdown: one option per entry of EXAMPLES ({id, derivatives}).
const exSel=document.getElementById('ex');
for(const e of EXAMPLES){const o=document.createElement('option');o.value=e.id;
o.textContent=e.id+' ('+e.derivatives+' derivatives)';exSel.appendChild(o);}
// Choosing an example copies its id into the search box and opens it in the Explorer.
exSel.onchange=()=>{if(exSel.value){document.getElementById('q').value=exSel.value;explore();}};
// Colour per relationship type, used for edges, legend swatches and list dots.
// Every lookup falls back to '#999' for a type that is not listed here.
const COLORS={exact_copy:'#8a8a80',filtered_subset:'#4a6b8a',subset_modified:'#5a7a9a',
subset_reformatted:'#39597a',cleaned:'#6a8a5a',regenerated_variant:'#b08a3e',
modified_variant:'#9a8a6a',augmentation:'#7a6a9a',augmentation_modified:'#8a7aaa',
translation:'#a8566a',reformat:'#4a8a8a',partial_overlap:'#aaaaa0',declared:'#c8c8b8'};
// corner colour key — clickable filter (toggle type visibility)
// Order here is the order of the legend rows. 'combined' has no entry in COLORS,
// so its swatch uses the '#999' fallback.
const LEGEND_TYPES=['exact_copy','filtered_subset','subset_modified','subset_reformatted','cleaned',
'regenerated_variant','augmentation','augmentation_modified','translation','reformat','modified_variant',
'partial_overlap','combined','declared'];
// Types currently switched on; starts with every legend type.
const ACTIVE_TYPES=new Set(LEGEND_TYPES);
// The vis.Network the legend filter acts on; set by map() and go().
let currentNetwork=null;
// Build the legend once: a caption, one clickable row per type (data-type), and a 'show all' reset row.
(function(){let h='<div style="font-style:italic;color:var(--muted);margin-bottom:3px">relationship · click to narrow</div>';
for(const t of LEGEND_TYPES)
h+='<div class="lt" data-type="'+t+'"><span class="dot" style="background:'+(COLORS[t]||'#999')+'"></span>'+
t.replace(/_/g,' ')+(t==='declared'?' (dashed)':'')+'</div>';
h+='<div class="reset" data-reset="1">show all</div>';
document.getElementById('legend').innerHTML=h;})();
let filterMode=false; // false=show-all default; flips after first click
function applyFilter(){
  // Re-apply the legend filter to the network currently on screen.
  // Show-all mode (filterMode false): nothing is hidden. This keeps isolated nodes
  // visible (e.g. map hubs that have no chain edge) as well as edges whose type
  // has no legend entry.
  // Filter mode: only edges of an active type stay visible, together with the
  // nodes they touch; the focus node is always kept.
  if(!currentNetwork)return;
  const edgesDS=currentNetwork.body.data.edges, nodesDS=currentNetwork.body.data.nodes;
  const keepEdge=e=>!filterMode||ACTIVE_TYPES.has(e.__type);
  const allEdges=edgesDS.get();
  edgesDS.update(allEdges.map(e=>({id:e.id, hidden:!keepEdge(e)})));
  // ids of nodes that still have at least one visible incident edge
  const visible=new Set();
  for(const e of allEdges) if(keepEdge(e)){visible.add(e.from);visible.add(e.to);}
  nodesDS.update(nodesDS.get().map(n=>({id:n.id, hidden:filterMode && !visible.has(n.id) && !n.__focus})));
}
// Legend interaction (one delegated listener for all rows):
//  - 'show all' row: re-activate every type and leave filter mode;
//  - a type row: first click narrows to that type only, later clicks toggle
//    types in or out; removing the last active type returns to show-all.
// Each branch ends by calling applyFilter() to refresh the network.
document.getElementById('legend').addEventListener('click',ev=>{
const reset=ev.target.closest('[data-reset]');
if(reset){LEGEND_TYPES.forEach(t=>ACTIVE_TYPES.add(t));
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.remove('off'));
filterMode=false;applyFilter();return;}
// clicks outside a type row (e.g. on the caption) are ignored
const lt=ev.target.closest('.lt');
if(!lt||!lt.dataset.type)return;
const t=lt.dataset.type;
if(!filterMode){
// first click → narrow to ONLY this type; subsequent clicks add more
ACTIVE_TYPES.clear();ACTIVE_TYPES.add(t);
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.toggle('off', el.dataset.type!==t));
filterMode=true;
} else if(ACTIVE_TYPES.has(t)){
// already in view → remove
ACTIVE_TYPES.delete(t);lt.classList.add('off');
if(ACTIVE_TYPES.size===0){
// emptied → return to show-all
LEGEND_TYPES.forEach(x=>ACTIVE_TYPES.add(x));
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.remove('off'));
filterMode=false;
}
} else {
// add to view
ACTIVE_TYPES.add(t);lt.classList.remove('off');
}
applyFilter();
});
// ---------- Overview ----------
function bars(rows,opts){
  // Render rows as a three-column table: label, bar scaled to the largest n, count.
  //   opts.type truthy -> rows are {t,n} relationship types (coloured dot + name);
  //   otherwise        -> rows are {id,n}; ids containing '/' are datasets (link that
  //                       opens the Explorer), other ids are orgs/users. Both get a Hub link.
  // Values coming from the data are HTML-escaped before being concatenated into
  // markup, and a missing type is labelled 'unknown' instead of throwing.
  const esc=s=>String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
  opts=opts||{};
  const max=Math.max(...rows.map(r=>r.n),1);
  let h='<table class="bars">';
  for(const r of rows){
    const w=Math.round(100*r.n/max);
    let lbl;
    if(opts.type){
      const t=r.t==null?'unknown':String(r.t);
      lbl='<span class="dot" style="background:'+(COLORS[t]||'#999')+'"></span>'+esc(t.replace(/_/g,' '));
    } else {
      const id=String(r.id==null?'':r.id), safe=esc(id);
      const isDataset=id.includes('/');
      const hubUrl=esc('https://huggingface.co/'+(isDataset?'datasets/':'')+id);
      lbl=(isDataset?'<a data-ds="'+safe+'">'+safe+'</a>':'<span>'+safe+'</span>')+
        '<a class="hub" href="'+hubUrl+'" target="_blank" rel="noopener" title="open on HF Hub">↗</a>';
    }
    const col=opts.type?(COLORS[r.t]||'#999'):'var(--bar)';
    h+='<tr><td class="lbl">'+lbl+'</td><td class="bar"><span style="width:'+w+'%;background:'+col+'"></span></td>'
      +'<td class="num">'+r.n.toLocaleString()+'</td></tr>';
  }
  return h+'</table>';
}
function fig(title,cap,body){
  // One overview figure: heading, caption, then the already-rendered body HTML.
  return `<div class="fig"><h3>${title}</h3><div class="cap">${cap}</div>${body}</div>`;
}
function showOverview(){
  // Switch to the Overview tab. The figure grid is built on the first call only;
  // later calls just make it visible again.
  const ov=document.getElementById('overview');
  document.getElementById('wrap').style.display='none';
  ov.style.display='block';
  document.getElementById('tabOv').classList.add('on');
  document.getElementById('tabEx').classList.remove('on');
  if(ov.dataset.done)return;
  const figures=[
    fig('Relationship types','How '+STATS.inferred.toLocaleString()+' undeclared derivatives were made.',bars(ANALYSIS.type_dist,{type:1})),
    fig('Most re-uploaded','Datasets with the most exact copies — redundant re-uploads on the Hub.',bars(ANALYSIS.most_copied)),
    fig('Most-derived datasets','The source datasets the community builds on most.',bars(ANALYSIS.most_derived)),
    fig('Most-translated','Datasets ported to the most other languages.',bars(ANALYSIS.most_translated)),
    fig('Most prolific producers','Orgs/users that publish the most derivative datasets.',bars(ANALYSIS.top_orgs))
  ];
  ov.innerHTML='<div class="grid">'+figures.join('')+
    '</div><p class="cap" style="margin-top:26px;max-width:760px">Figures are over the analysed sample of high-popularity source datasets, not the whole Hub. Click any dataset to open it in the Explorer.</p>';
  ov.dataset.done='1';
}
// ---------- Explorer ----------
// The vis.Network most recently created by map() or go().
let network;
// Small coloured dot for relationship type t ('#999' when the type has no colour).
function dot(t){return '<span class="dot" style="background:'+(COLORS[t]||'#999')+'"></span>';}
// Open the Explorer on whatever is typed in the search box; does nothing when the box is empty.
function explore(){const ds=document.getElementById('q').value.trim();if(ds)showExplorer(ds);}
function showExplorer(ds){
  // Switch to the Explorer tab. With a dataset id: put it in the search box and
  // draw its lineage. Without one: draw the overall map (zoom in by clicking).
  document.getElementById('overview').style.display='none';
  document.getElementById('wrap').style.display='flex';
  document.getElementById('tabEx').classList.add('on');
  document.getElementById('tabOv').classList.remove('on');
  if(!ds){map();return;}
  document.getElementById('q').value=ds;
  go();
}
async function map(){
  // Draw the overall lineage map from /api/map: node size = number of derivatives,
  // only nodes with at least 8 derivatives are labelled, edge colour = relationship type.
  const g=await fetch('/api/map').then(x=>x.json());
  const nodes=g.nodes.map(n=>({id:n.id,label:(n.derivatives>=8)?n.label:undefined,
    title:n.id+' · '+n.derivatives+' derivatives · '+(n.downloads||0).toLocaleString()+' downloads',
    shape:'dot',size:6+Math.sqrt(n.derivatives||1)*2.4,
    color:{background:'#43586b',border:'#43586b'},
    font:{color:'#111',face:'Palatino, Georgia, serif',size:13,strokeWidth:4,strokeColor:'#fffff8'}}));
  const edges=g.edges.map(e=>({from:e.from,to:e.to,__type:e.type,arrows:{to:{scaleFactor:.4}},
    color:{color:COLORS[e.type]||'#999',opacity:.9},width:1.6,title:e.type+' · '+e.confidence}));
  // Tear down the previous network first so its listeners and physics loop do not
  // pile up behind the new one.
  if(network)network.destroy();
  // Local handle: the callbacks below must act on THIS network even if the global
  // has been replaced by a later render.
  const net=new vis.Network(document.getElementById('net'),{nodes,edges},
    {physics:{stabilization:{iterations:160,updateInterval:50},
      barnesHut:{gravitationalConstant:-14000,springLength:95,centralGravity:.12,avoidOverlap:.2}},
     nodes:{borderWidth:0},edges:{smooth:false},interaction:{hover:true,hideEdgesOnDrag:true,tooltipDelay:120}});
  network=net;
  // freeze the layout once the initial stabilisation has finished
  net.on('stabilizationIterationsDone',()=>net.setOptions({physics:false}));
  net.on('click',p=>{if(p.nodes.length)showExplorer(p.nodes[0]);});
  currentNetwork=net;applyFilter();
  document.getElementById('side').innerHTML='<h2>Lineage map</h2><div class="meta">'+
    g.nodes.length.toLocaleString()+' source datasets · '+g.edges.length+' multi-level chain links</div>'+
    '<p class="row">Node <b>size</b> = number of derivatives. Edge <b>colour</b> = relationship type (key, lower-left). '+
    'Large dots are heavily-reused datasets; connected runs are multi-step lineages. '+
    '<b>Click any node</b> to open its full lineage.</p>';
}
async function go(){
  // Draw the lineage graph of the dataset in the search box (depth 2) and fill the
  // side panel with its parents / children / siblings.
  const ds=document.getElementById('q').value.trim();if(!ds)return;
  const [g,r]=await Promise.all([
    fetch('/api/graph?dataset='+encodeURIComponent(ds)+'&depth=2').then(x=>x.json()),
    fetch('/related?dataset='+encodeURIComponent(ds)).then(x=>x.json())]);
  // On big graphs only the focus and the 40 most-downloaded nodes keep a label.
  const top=new Set(g.nodes.slice().sort((a,b)=>(b.downloads||0)-(a.downloads||0)).slice(0,40).map(n=>n.id));
  const big=g.nodes.length>250;
  const nodes=g.nodes.map(n=>({id:n.id,__focus:n.focus,label:(n.focus||top.has(n.id)||!big)?n.label:undefined,title:n.title,shape:'dot',
    size:n.focus?16:(6+Math.log10((n.downloads||0)+10)*2.2),
    color:{background:n.focus?'#c1440e':'#43586b',border:n.focus?'#c1440e':'#43586b'},
    font:{color:'#111',face:'Palatino, Georgia, serif',size:n.focus?16:12,strokeWidth:4,strokeColor:'#fffff8'}}));
  // Declared edges are drawn thinner and dashed; inferred edges solid.
  const edges=g.edges.map(e=>({from:e.from,to:e.to,__type:e.type,arrows:{to:{scaleFactor:.45}},
    color:{color:COLORS[e.type]||'#999',opacity:.8},width:e.source==='inferred'?1.2:.6,
    dashes:e.source==='declared',title:e.type+' · conf '+e.confidence+' · '+e.source}));
  // adaptive layout: hub-shaped (many children of focus) -> force (spreads radially);
  // chain-shaped (multi-level) -> hierarchical LR (reads as a tree).
  const fanout=g.edges.filter(e=>e.from===ds||e.to===ds).length;
  const useForce=fanout>25||g.nodes.length>180;
  const opts=useForce?{
    physics:{stabilization:{iterations:200,updateInterval:50},
      barnesHut:{gravitationalConstant:-12000,springLength:90,centralGravity:.05,
        avoidOverlap:.35,damping:.4}},
    nodes:{borderWidth:0},edges:{smooth:false,width:.6,color:{opacity:.55}},
    interaction:{hover:!big,tooltipDelay:120,hideEdgesOnDrag:true}}:{
    layout:{hierarchical:{enabled:true,direction:'LR',sortMethod:'directed',
      levelSeparation:230,nodeSpacing:16,treeSpacing:36,blockShifting:true,edgeMinimization:true}},
    physics:false,nodes:{borderWidth:0},edges:{smooth:false},
    interaction:{hover:!big,tooltipDelay:120,hideEdgesOnDrag:true}};
  // Tear down the previous network first so its listeners and physics loop do not
  // pile up behind the new one.
  if(network)network.destroy();
  // Local handle: the callbacks below must act on THIS network even if the global
  // has been replaced by a later render.
  const net=new vis.Network(document.getElementById('net'),{nodes,edges},opts);
  network=net;
  if(useForce)net.on('stabilizationIterationsDone',()=>net.setOptions({physics:false}));
  net.on('click',p=>{if(p.nodes.length){showExplorer(p.nodes[0]);}});
  currentNetwork=net;applyFilter();
  renderSide(r);
}
function lst(title,arr){
  // Render one side-panel section: a heading with the item count, then one row per
  // related dataset (type dot, Explorer link, Hub link, type name and size ratio).
  // Returns '' for a missing or empty list so the section is omitted.
  // Ids and type names come from the data, so they are HTML-escaped before being
  // placed in text or attribute positions.
  const esc=s=>String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
  if(!arr||!arr.length)return '';
  let h='<h3>'+title+' · '+arr.length+'</h3>';
  for(const x of arr){
    const t=x.primary_type||'', id=esc(x.id);
    h+='<div class="row">'+dot(t)+'<a data-ds="'+id+'">'+id+'</a>'+
      '<a class="hub" href="https://huggingface.co/datasets/'+id+'" target="_blank" rel="noopener" title="open on HF Hub">↗</a>'+
      (t?'<span class="ty">'+esc(t.replace(/_/g,' '))+(x.size_ratio?' ×'+esc(x.size_ratio):'')+'</span>':'')+'</div>';
  }
  return h;
}
function renderSide(r){
  // Fill the side panel from a /related response: title, download count, author,
  // Hub link, the three relation lists, and the matching API query.
  // r.dataset echoes what was typed in the search box, so it (and the author) are
  // HTML-escaped before being written through innerHTML.
  const esc=s=>String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;');
  const m=r.meta||{};
  const name=esc(r.dataset);
  let h='<h2>'+name+'</h2><div class="meta">'+(m.downloads||0).toLocaleString()+' downloads'+
    (m.author?' · '+esc(m.author):'')+' · <a href="https://huggingface.co/datasets/'+name+'" target="_blank" rel="noopener">Hub ↗</a></div>';
  h+=lst('Derived from — parents',r.parents);
  h+=lst('Derivatives — children',r.children);
  h+=lst('Siblings — share a parent',r.siblings);
  if(!r.parents.length&&!r.children.length&&!r.siblings.length)
    h+='<p class="placeholder">No lineage recorded for this dataset in the current sample.</p>';
  h+='<h3>query API</h3><div class="row"><code>/related?dataset='+name+'</code></div>';
  document.getElementById('side').innerHTML=h;
}
// any dataset link anywhere -> open in explorer
// Delegated on document so links rendered later (overview tables, side-panel lists)
// work without their own handlers; the dataset id is read from the data-ds attribute.
document.addEventListener('click',e=>{const a=e.target.closest('a[data-ds]');
if(a){e.preventDefault();showExplorer(a.getAttribute('data-ds'));}});
// Initial view on page load: the Overview tab.
showOverview();
</script></body></html>"""
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
    # uvicorn is imported here so that importing this module does not require it.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)