davanstrien HF Staff
db: prefer bucket-mounted /data/lineage.db with baked-in fallback
c084cd8 verified | """Dataset Lineage Explorer — FastAPI + SQLite (read-only) + vis-network. | |
| - Overview: analysis dashboard (relationship-type mix, most-copied, most-derived, | |
| most-translated, most-prolific producers) over the inferred lineage graph. | |
| - Explorer: search/pick a dataset -> interactive lineage graph + related lists. | |
| - API: GET /related?dataset=<id>, /api/graph, /api/map, /analysis, /examples, /stats. | |
| DB is opened read-only; the bucket-mounted /data/lineage.db is preferred, with the lineage.db baked into the repo as a fallback. | |
| Edward Tufte-inspired styling. | |
| """ | |
| import json | |
| import sqlite3 | |
| from functools import lru_cache | |
| from pathlib import Path | |
| from fastapi import FastAPI, Query, Request | |
| from fastapi.middleware.gzip import GZipMiddleware | |
| from fastapi.responses import HTMLResponse, JSONResponse | |
# Prefer the bucket-mounted DB (/data/lineage.db, persistent + can grow);
# fall back to the baked-in copy if the mount isn't present (cold-mount fail,
# local dev, etc.) so the Space stays up either way.
BUCKET_DB = Path("/data/lineage.db")
LOCAL_DB = Path(__file__).parent / "lineage.db"
# Probe the mount exactly once so the chosen DB and the startup log line
# cannot disagree if the mount appears or disappears between two checks.
_BUCKET_MOUNTED = BUCKET_DB.exists()
DB = BUCKET_DB if _BUCKET_MOUNTED else LOCAL_DB
print(f"using DB: {DB} (bucket-mount={_BUCKET_MOUNTED})", flush=True)
# High safety ceiling only — stops a depth-2 hub exploding to tens of thousands
# of nodes. Normal hubs (incl. FLAN's ~1.4k children) render fully.
MAX_NODES = 2500

app = FastAPI(title="Dataset Lineage Explorer")
app.add_middleware(GZipMiddleware, minimum_size=500)

# Build the SQLite URI with Path.as_uri() so characters that are special in a
# URI ("?", "#", "%", spaces) inside the DB path are percent-encoded instead
# of being misread as the start of the query string. mode=ro keeps the
# connection read-only; check_same_thread=False allows this one connection to
# be used from threads other than the one that created it.
_con = sqlite3.connect(
    f"{Path(DB).resolve().as_uri()}?mode=ro", uri=True, check_same_thread=False
)
_con.row_factory = sqlite3.Row
# URL path prefixes whose responses are derived purely from the read-only DB.
_CACHEABLE_PREFIXES = ("/api", "/related", "/examples", "/stats", "/analysis")


@app.middleware("http")
async def cache_headers(request: Request, call_next):
    """Add a one-day public ``Cache-Control`` header to successful data responses.

    Args:
        request: The incoming request; only its URL path is inspected.
        call_next: Callable that forwards the request to the next handler and
            returns its response.

    Returns:
        The downstream response, with ``Cache-Control`` set when the path
        starts with one of the data-endpoint prefixes and the status is 200.
    """
    resp = await call_next(request)
    # Only cache successful responses: a validation error or a transient 5xx
    # must not be pinned in shared caches for a day.
    if resp.status_code == 200 and request.url.path.startswith(_CACHEABLE_PREFIXES):
        resp.headers["Cache-Control"] = "public, max-age=86400"
    return resp
def _meta(dataset: str) -> dict:
    """Return the ``nodes`` row for *dataset* as a plain dict.

    When the dataset has no row, a placeholder dict with zeroed counters and
    an empty author is returned so callers can index it unconditionally.
    """
    row = _con.execute("SELECT * FROM nodes WHERE dataset_id=?", (dataset,)).fetchone()
    if not row:
        return {"dataset_id": dataset, "downloads": 0, "likes": 0, "author": ""}
    return dict(row)
def parents(dataset: str) -> list[dict]:
    """Return the edges whose child is *dataset*, strongest confidence first.

    Each item is a plain dict with keys ``id`` (the parent dataset),
    ``primary_type``, ``confidence``, ``tags``, ``size_ratio`` and ``source``.
    """
    query = (
        "SELECT parent AS id, primary_type, confidence, tags, size_ratio, source "
        "FROM edges WHERE child=? ORDER BY confidence DESC"
    )
    cursor = _con.execute(query, (dataset,))
    return list(map(dict, cursor.fetchall()))
def children(dataset: str) -> list[dict]:
    """Return the edges whose parent is *dataset*, strongest confidence first.

    Each item is a plain dict with keys ``id`` (the child dataset),
    ``primary_type``, ``confidence``, ``tags``, ``size_ratio`` and ``source``.
    """
    query = (
        "SELECT child AS id, primary_type, confidence, tags, size_ratio, source "
        "FROM edges WHERE parent=? ORDER BY confidence DESC"
    )
    cursor = _con.execute(query, (dataset,))
    return list(map(dict, cursor.fetchall()))
def siblings(dataset: str) -> list[dict]:
    """Return up to 80 other datasets that share a parent with *dataset*.

    The edge table is joined to itself on ``parent``; each item is a plain
    dict with keys ``id`` (the sibling), ``via`` (the shared parent) and
    ``primary_type`` (the sibling's relationship to that parent). The query
    has no ORDER BY, so which rows survive the limit is up to SQLite.
    """
    query = (
        "SELECT DISTINCT e2.child AS id, e2.parent AS via, e2.primary_type "
        "FROM edges e1 JOIN edges e2 ON e1.parent=e2.parent "
        "WHERE e1.child=? AND e2.child!=? LIMIT 80"
    )
    cursor = _con.execute(query, (dataset, dataset))
    return list(map(dict, cursor.fetchall()))
def related(dataset: str) -> dict:
    """Bundle metadata, parents, children and siblings of *dataset* in one dict."""
    payload = {"dataset": dataset}
    payload["meta"] = _meta(dataset)
    payload["parents"] = parents(dataset)
    payload["children"] = children(dataset)
    payload["siblings"] = siblings(dataset)
    return payload
# Registered at /related: the page script fetches '/related?dataset=…' and the
# cache middleware whitelists the same prefix.
@app.get("/related")
def related_endpoint(dataset: str = Query(..., description="dataset id, e.g. tatsu-lab/alpaca")):
    """JSON of parents/children/siblings — for metadata workflows.

    Args:
        dataset: Required query parameter holding the dataset id.

    Returns:
        A ``JSONResponse`` wrapping the dict built by ``related(dataset)``.
    """
    return JSONResponse(related(dataset))
def _walk_lineage(start: str, hops: int, neighbours, upward: bool,
                  nodes: dict, edges: dict) -> None:
    """Breadth-first walk from *start* in one direction, filling *nodes*/*edges*.

    Args:
        start: Dataset id the walk begins at.
        hops: Maximum number of levels to expand.
        neighbours: ``parents`` or ``children``; returns edge dicts with ``id``.
        upward: True when *neighbours* yields parents (edge is neighbour -> current).
        nodes: Insertion-ordered dict used as an ordered set of node ids; mutated.
        edges: Maps ``(from, to)`` to ``(type, confidence, source)``; mutated.
    """
    frontier = [start]
    for _ in range(hops):
        reached = []
        for current in frontier:
            for row in neighbours(current):
                other = row["id"]
                key = (other, current) if upward else (current, other)
                # Every edge is recorded (a later row for the same pair
                # overwrites an earlier one); edges whose endpoints did not
                # make it under the node cap are filtered out by the caller.
                edges[key] = (row["primary_type"], row["confidence"], row["source"])
                if other not in nodes and len(nodes) < MAX_NODES:
                    nodes[other] = None
                    reached.append(other)
        frontier = reached
        if len(nodes) >= MAX_NODES:
            break


# Registered at /api/graph: the page script fetches
# '/api/graph?dataset=…&depth=2'.
@app.get("/api/graph")
def graph(dataset: str, depth: int = 2):
    """Return the lineage neighbourhood of *dataset* as vis-network style JSON.

    Ancestors are walked first, then descendants, each for ``depth`` levels
    (clamped to 1..4) and sharing one node budget of ``MAX_NODES``.

    Args:
        dataset: Id of the focus dataset.
        depth: Requested number of levels in each direction.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Nodes carry ``id``, ``label``,
        ``title``, ``downloads`` and ``focus``; edges carry ``from``, ``to``,
        ``type``, ``confidence`` and ``source``. Nodes are listed in discovery
        order with the focus dataset first.
    """
    hops = max(1, min(depth, 4))
    # A dict keeps discovery order, so the payload is deterministic; a plain
    # set of str would iterate in a hash-randomised order.
    nodes: dict = {dataset: None}
    edges: dict = {}
    _walk_lineage(dataset, hops, parents, True, nodes, edges)
    _walk_lineage(dataset, hops, children, False, nodes, edges)

    node_list = []
    for node_id in nodes:
        # One metadata lookup per node; a missing or NULL download count is
        # treated as 0 so the thousands-separator format cannot fail.
        downloads = _meta(node_id).get("downloads") or 0
        node_list.append({
            "id": node_id,
            "label": node_id.split("/")[-1],
            "title": f"{node_id} ({downloads:,} dl)",
            "downloads": downloads,
            "focus": node_id == dataset,
        })
    edge_list = [
        {"from": a, "to": b, "type": t, "confidence": cf, "source": s}
        for (a, b), (t, cf, s) in edges.items()
        if a in nodes and b in nodes
    ]
    return {"nodes": node_list, "edges": edge_list}
@lru_cache(maxsize=1)
def _stats():
    """Return headline counts for the lineage database.

    The result is memoised for the life of the process: the connection is
    read-only and these aggregates are otherwise recomputed on every page
    load. Callers must treat the returned dict as read-only.

    Returns:
        Dict with ``nodes`` (rows in ``nodes``), ``edges`` (rows in ``edges``),
        ``inferred`` (edges with ``source='inferred'``) and ``derivatives``
        (distinct children among inferred edges).
    """
    def count(sql: str) -> int:
        return _con.execute(sql).fetchone()[0]

    return {
        "nodes": count("SELECT count(*) FROM nodes"),
        "edges": count("SELECT count(*) FROM edges"),
        "inferred": count("SELECT count(*) FROM edges WHERE source='inferred'"),
        "derivatives": count("SELECT count(DISTINCT child) FROM edges WHERE source='inferred'"),
    }
# Upper bound on how many example hubs a single request may ask for.
_MAX_EXAMPLES = 500


@lru_cache(maxsize=32)
def _examples(n: int):
    """Return the *n* parents with the most inferred derivatives.

    Args:
        n: Requested number of rows. Clamped to ``0.._MAX_EXAMPLES`` because
            SQLite treats a negative LIMIT as "no limit", which would let a
            caller dump every parent in the table.

    Returns:
        List of ``{"id", "derivatives"}`` dicts, most-derived first. Results
        are memoised per ``n``; treat the list as read-only.
    """
    limit = max(0, min(n, _MAX_EXAMPLES))
    rows = _con.execute(
        "SELECT parent AS id, count(*) AS derivatives FROM edges "
        "WHERE source='inferred' GROUP BY parent ORDER BY derivatives DESC LIMIT ?", (limit,)
    ).fetchall()
    return [dict(r) for r in rows]
@lru_cache(maxsize=1)
def _analysis():
    """Return the aggregate tables shown on the Overview tab.

    Memoised for the life of the process (read-only DB, five GROUP BY scans
    per call otherwise); treat the returned dict as read-only.

    Returns:
        Dict with ``type_dist`` (rows ``{t, n}``: inferred edges per
        relationship type) and ``most_copied``, ``most_derived``,
        ``most_translated``, ``top_orgs`` (rows ``{id, n}``), each sorted by
        ``n`` descending.
    """
    def top(sql, *params):
        return [dict(row) for row in _con.execute(sql, params).fetchall()]

    return {
        "type_dist": top("SELECT primary_type t, count(*) n FROM edges WHERE source='inferred' "
                         "GROUP BY t ORDER BY n DESC"),
        "most_copied": top("SELECT parent id, count(*) n FROM edges WHERE primary_type='exact_copy' "
                           "GROUP BY parent ORDER BY n DESC LIMIT 12"),
        "most_derived": top("SELECT parent id, count(*) n FROM edges WHERE source='inferred' "
                            "GROUP BY parent ORDER BY n DESC LIMIT 12"),
        "most_translated": top("SELECT parent id, count(*) n FROM edges WHERE primary_type='translation' "
                               "GROUP BY parent ORDER BY n DESC LIMIT 10"),
        # The org/user is the part of the child id before the first "/";
        # ids without a slash are excluded.
        "top_orgs": top("SELECT substr(child,1,instr(child,'/')-1) id, count(*) n FROM edges "
                        "WHERE source='inferred' AND instr(child,'/')>0 GROUP BY id ORDER BY n DESC LIMIT 12"),
    }
@lru_cache(maxsize=1)
def _map():
    """Overview map: multi-level chains (a derivative that is itself derived-from)
    + the top fan-out hubs. Nodes sized by derivative count.

    Memoised for the life of the process because it issues one metadata lookup
    per node on top of two full scans of the inferred edges; treat the
    returned dict as read-only.

    Returns:
        ``{"nodes": [...], "edges": [...]}``. Nodes carry ``id``, ``label``,
        ``derivatives`` and ``downloads`` and are sorted by id; edges carry
        ``from``, ``to``, ``type`` and ``confidence``.
    """
    # parent id -> number of inferred derivatives
    counts = dict(_con.execute(
        "SELECT parent, count(*) FROM edges WHERE source='inferred' GROUP BY parent"
    ).fetchall())
    # Backbone = inferred edges whose child is itself a parent of an inferred edge.
    backbone = _con.execute(
        "SELECT parent, child, primary_type, confidence FROM edges WHERE source='inferred' "
        "AND child IN (SELECT DISTINCT parent FROM edges WHERE source='inferred')"
    ).fetchall()

    nodeset, edges = set(), []
    for r in backbone:
        nodeset.update((r["parent"], r["child"]))
        edges.append({"from": r["parent"], "to": r["child"],
                      "type": r["primary_type"], "confidence": r["confidence"]})
    # Add the 60 parents with the most derivatives, whether or not they sit on a chain.
    top_hubs = sorted(counts.items(), key=lambda item: -item[1])[:60]
    nodeset.update(pid for pid, _n in top_hubs)

    # sorted() makes the payload order deterministic (set order is not).
    nodes = [{"id": n, "label": n.split("/")[-1], "derivatives": counts.get(n, 0),
              "downloads": _meta(n).get("downloads", 0)} for n in sorted(nodeset)]
    return {"nodes": nodes, "edges": edges}
# Registered at /api/map: the page script's map() fetches '/api/map'.
@app.get("/api/map")
def api_map():
    """Serve the overview lineage map built by ``_map()``."""
    return _map()
# Registered at /stats, matching the module docstring and the cache whitelist.
@app.get("/stats")
def stats():
    """Serve the headline counts built by ``_stats()``."""
    return _stats()
# Registered at /examples, matching the module docstring and the cache whitelist.
@app.get("/examples")
def examples(n: int = 24):
    """Serve the *n* most-derived parent datasets (see ``_examples``).

    Args:
        n: Optional query parameter; number of rows requested (default 24).
    """
    return _examples(n)
# Registered at /analysis, matching the module docstring and the cache whitelist.
@app.get("/analysis")
def analysis():
    """Aggregate analysis of the inferred lineage graph."""
    return _analysis()
# response_class=HTMLResponse is required: returning a str from a default
# (JSON) route would send the page as a JSON-encoded string.
@app.get("/", response_class=HTMLResponse)
def index():
    """Render the single-page UI with its bootstrap data inlined.

    The ``__STATS__``, ``__EXAMPLES__`` and ``__ANALYSIS__`` placeholders in
    ``INDEX_HTML`` are replaced with JSON literals that the page script reads
    as JavaScript values.

    Returns:
        The complete HTML document as a string.
    """
    def embed(value) -> str:
        # The JSON lands inside a <script> element, where a literal "</script"
        # in any string value would end the script early. "<\/" is an
        # equivalent escape in both JSON and JavaScript string literals.
        return json.dumps(value).replace("</", "<\\/")

    return (INDEX_HTML
            .replace("__STATS__", embed(_stats()))
            .replace("__EXAMPLES__", embed(_examples(24)))
            .replace("__ANALYSIS__", embed(_analysis())))
# Single-page UI template (HTML + CSS + JS). index() substitutes the
# __STATS__, __EXAMPLES__ and __ANALYSIS__ placeholders with JSON before
# serving. Inside the script:
#   - every database/user-supplied string is passed through esc() before it is
#     concatenated into innerHTML;
#   - the legend filter hides nothing while in show-all mode, so edges whose
#     type is not in the legend and hubs without chain links stay visible;
#   - the previous vis.Network is destroyed before a new one is created.
INDEX_HTML = """<!doctype html><html><head><meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Dataset Lineage Explorer</title>
<script src="https://unpkg.com/vis-network/standalone/umd/vis-network.min.js"></script>
<style>
:root{--paper:#fffff8;--ink:#111;--muted:#6f6f66;--rule:#dcdccf;--accent:#c1440e;--bar:#43586b}
*{box-sizing:border-box}
body{margin:0;background:var(--paper);color:var(--ink);
font-family:Palatino,"Palatino Linotype","Book Antiqua",Georgia,serif;font-size:16px;line-height:1.5}
header{padding:22px 34px 14px;border-bottom:1px solid var(--rule)}
h1{font-weight:400;font-size:23px;margin:0;display:inline}
.sub{font-style:italic;color:var(--muted);font-size:14px;margin:3px 0 12px}
nav{display:flex;gap:18px;align-items:flex-end;flex-wrap:wrap}
nav .tab{font-size:15px;cursor:pointer;border-bottom:1px solid transparent;padding-bottom:2px}
nav .tab.on{border-color:var(--ink)}
.controls{display:flex;gap:18px;align-items:flex-end;margin-left:auto;flex-wrap:wrap}
label{display:block;font-size:11px;color:var(--muted);letter-spacing:.05em;margin-bottom:2px}
input,select{font-family:inherit;font-size:15px;background:var(--paper);border:0;
border-bottom:1px solid var(--ink);color:var(--ink);padding:3px 2px}
input{width:250px}
button{font-family:inherit;font-size:15px;background:none;border:0;color:var(--accent);
border-bottom:1px solid var(--accent);cursor:pointer;padding:0 0 2px}
#overview{padding:26px 40px 60px;overflow:auto;height:calc(100vh - 132px)}
.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(360px,1fr));gap:34px 50px;max-width:1180px}
.fig h3{font-weight:400;font-size:13px;letter-spacing:.09em;text-transform:uppercase;color:var(--muted);
margin:0 0 3px;border-bottom:1px solid var(--rule);padding-bottom:3px}
.cap{font-style:italic;color:var(--muted);font-size:12.5px;margin:0 0 9px}
table.bars{width:100%;border-collapse:collapse;font-size:14px}
table.bars td{padding:2.5px 6px;vertical-align:middle}
td.lbl{white-space:nowrap;max-width:230px;overflow:hidden;text-overflow:ellipsis}
td.bar{width:46%}
td.bar span{display:block;height:11px;background:var(--bar);opacity:.85}
td.num{text-align:right;color:var(--muted);font-variant-numeric:tabular-nums;width:48px}
#wrap{display:flex;height:calc(100vh - 132px);position:relative}
#net{flex:2.3;min-width:0}
#legend{position:absolute;left:14px;bottom:14px;background:rgba(255,255,248,.94);
border:1px solid var(--rule);padding:8px 11px;font-size:11px;line-height:1.75;max-width:210px;
box-shadow:0 1px 3px rgba(0,0,0,.06);user-select:none}
#legend .lt{white-space:nowrap;cursor:pointer;padding:1px 4px;margin:0 -4px;border-radius:3px}
#legend .lt:hover{background:#f1f1e6}
#legend .lt.off{opacity:.35;text-decoration:line-through}
#legend .reset{margin-top:5px;padding-top:4px;border-top:1px solid var(--rule);
font-style:italic;color:var(--muted);cursor:pointer}
#legend .reset:hover{color:var(--ink)}
@media(max-width:760px){#legend{display:none}}
#side{flex:1;max-width:440px;overflow:auto;padding:22px 28px;border-left:1px solid var(--rule)}
#side h2{font-weight:400;font-size:19px;margin:0;word-break:break-all}
.meta{color:var(--muted);font-style:italic;font-size:13px;margin:4px 0}
#side h3{font-weight:400;font-size:12px;letter-spacing:.1em;text-transform:uppercase;color:var(--muted);
margin:22px 0 7px;border-bottom:1px solid var(--rule);padding-bottom:3px}
.row{padding:4px 0;font-size:15px}
.dot{display:inline-block;width:8px;height:8px;border-radius:50%;margin-right:8px;vertical-align:middle}
.ty{font-style:italic;color:var(--muted);font-size:12.5px;margin-left:7px}
a{color:var(--ink);text-decoration:none;border-bottom:1px solid var(--rule);cursor:pointer}
a:hover{border-color:var(--ink)}
.meta a{border:0;color:var(--accent)}
a.hub{border:0;color:var(--accent);font-size:11px;margin-left:4px;padding:0 3px;opacity:.7}
a.hub:hover{opacity:1}
code{font-family:ui-monospace,monospace;font-size:12.5px;background:#f1f1e6;padding:1px 5px}
.placeholder{color:var(--muted);font-style:italic}
</style></head><body>
<header>
<h1>Dataset Lineage Explorer</h1>
<div class="sub" id="sub"></div>
<nav>
<span class="tab on" id="tabOv" onclick="showOverview()">Overview</span>
<span class="tab" id="tabEx" onclick="showExplorer()">Explorer</span>
<span class="controls">
<span><label>dataset</label><input id="q" value="tatsu-lab/alpaca" spellcheck="false"
onkeydown="if(event.key==='Enter')explore()"></span>
<span><label>or a much-derived dataset</label>
<select id="ex"><option value="">— examples —</option></select></span>
<button onclick="explore()">explore →</button>
</span>
</nav>
</header>
<div id="overview"></div>
<div id="wrap" style="display:none"><div id="net"></div><div id="legend"></div><div id="side"></div></div>
<script>
const STATS=__STATS__, EXAMPLES=__EXAMPLES__, ANALYSIS=__ANALYSIS__;
// HTML-escape any value before it is concatenated into innerHTML
function esc(s){return String(s).replace(/[&<>"']/g,c=>'&#'+c.charCodeAt(0)+';');}
document.getElementById('sub').textContent =
STATS.nodes.toLocaleString()+' datasets · '+STATS.inferred.toLocaleString()+
' inferred lineage edges · '+STATS.derivatives.toLocaleString()+' datasets are derivatives — all recovered from content';
const exSel=document.getElementById('ex');
for(const e of EXAMPLES){const o=document.createElement('option');o.value=e.id;
o.textContent=e.id+' ('+e.derivatives+' derivatives)';exSel.appendChild(o);}
exSel.onchange=()=>{if(exSel.value){document.getElementById('q').value=exSel.value;explore();}};
const COLORS={exact_copy:'#8a8a80',filtered_subset:'#4a6b8a',subset_modified:'#5a7a9a',
subset_reformatted:'#39597a',cleaned:'#6a8a5a',regenerated_variant:'#b08a3e',
modified_variant:'#9a8a6a',augmentation:'#7a6a9a',augmentation_modified:'#8a7aaa',
translation:'#a8566a',reformat:'#4a8a8a',partial_overlap:'#aaaaa0',declared:'#c8c8b8'};
// corner colour key — clickable filter (toggle type visibility)
const LEGEND_TYPES=['exact_copy','filtered_subset','subset_modified','subset_reformatted','cleaned',
'regenerated_variant','augmentation','augmentation_modified','translation','reformat','modified_variant',
'partial_overlap','combined','declared'];
const ACTIVE_TYPES=new Set(LEGEND_TYPES);
let currentNetwork=null;
(function(){let h='<div style="font-style:italic;color:var(--muted);margin-bottom:3px">relationship · click to narrow</div>';
for(const t of LEGEND_TYPES)
h+='<div class="lt" data-type="'+t+'"><span class="dot" style="background:'+(COLORS[t]||'#999')+'"></span>'+
t.replace(/_/g,' ')+(t==='declared'?' (dashed)':'')+'</div>';
h+='<div class="reset" data-reset="1">show all</div>';
document.getElementById('legend').innerHTML=h;})();
let filterMode=false; // false=show-all default; flips after first click
function applyFilter(){
if(!currentNetwork)return;
const edgesDS=currentNetwork.body.data.edges, nodesDS=currentNetwork.body.data.nodes;
// show-all mode hides nothing, including edges whose type is not in the legend
const shown=e=>!filterMode||ACTIVE_TYPES.has(e.__type);
const allEdges=edgesDS.get();
edgesDS.update(allEdges.map(e=>({id:e.id, hidden:!shown(e)})));
// while filtering, hide nodes that have NO visible incident edge (except the focus)
const visible=new Set();
for(const e of allEdges) if(shown(e)){visible.add(e.from);visible.add(e.to);}
nodesDS.update(nodesDS.get().map(n=>({id:n.id, hidden:filterMode && !visible.has(n.id) && !n.__focus})));
}
document.getElementById('legend').addEventListener('click',ev=>{
const reset=ev.target.closest('[data-reset]');
if(reset){LEGEND_TYPES.forEach(t=>ACTIVE_TYPES.add(t));
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.remove('off'));
filterMode=false;applyFilter();return;}
const lt=ev.target.closest('.lt');
if(!lt||!lt.dataset.type)return;
const t=lt.dataset.type;
if(!filterMode){
// first click → narrow to ONLY this type; subsequent clicks add more
ACTIVE_TYPES.clear();ACTIVE_TYPES.add(t);
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.toggle('off', el.dataset.type!==t));
filterMode=true;
} else if(ACTIVE_TYPES.has(t)){
// already in view → remove
ACTIVE_TYPES.delete(t);lt.classList.add('off');
if(ACTIVE_TYPES.size===0){
// emptied → return to show-all
LEGEND_TYPES.forEach(x=>ACTIVE_TYPES.add(x));
document.querySelectorAll('#legend .lt').forEach(el=>el.classList.remove('off'));
filterMode=false;
}
} else {
// add to view
ACTIVE_TYPES.add(t);lt.classList.remove('off');
}
applyFilter();
});
// ---------- Overview ----------
function bars(rows,opts){
opts=opts||{};const max=Math.max(...rows.map(r=>r.n),1);let h='<table class="bars">';
for(const r of rows){const w=Math.round(100*r.n/max);
let lbl;
if(opts.type){
lbl='<span class="dot" style="background:'+(COLORS[r.t]||'#999')+'"></span>'+esc((r.t||'unknown').replace(/_/g,' '));
} else {
const isDataset=r.id.includes('/');
const id=esc(r.id);
const hubUrl='https://huggingface.co/'+(isDataset?'datasets/':'')+id;
lbl=(isDataset?'<a data-ds="'+id+'">'+id+'</a>':'<span>'+id+'</span>')+
'<a class="hub" href="'+hubUrl+'" target="_blank" rel="noopener" title="open on HF Hub">↗</a>';
}
const col=opts.type?(COLORS[r.t]||'#999'):'var(--bar)';
h+='<tr><td class="lbl">'+lbl+'</td><td class="bar"><span style="width:'+w+'%;background:'+col+'"></span></td>'
+'<td class="num">'+r.n.toLocaleString()+'</td></tr>';}
return h+'</table>';
}
function fig(title,cap,body){return '<div class="fig"><h3>'+title+'</h3><div class="cap">'+cap+'</div>'+body+'</div>';}
function showOverview(){
document.getElementById('wrap').style.display='none';
const ov=document.getElementById('overview');ov.style.display='block';
document.getElementById('tabOv').classList.add('on');document.getElementById('tabEx').classList.remove('on');
if(ov.dataset.done)return;
ov.innerHTML='<div class="grid">'+
fig('Relationship types','How '+STATS.inferred.toLocaleString()+' undeclared derivatives were made.',bars(ANALYSIS.type_dist,{type:1}))+
fig('Most re-uploaded','Datasets with the most exact copies — redundant re-uploads on the Hub.',bars(ANALYSIS.most_copied))+
fig('Most-derived datasets','The source datasets the community builds on most.',bars(ANALYSIS.most_derived))+
fig('Most-translated','Datasets ported to the most other languages.',bars(ANALYSIS.most_translated))+
fig('Most prolific producers','Orgs/users that publish the most derivative datasets.',bars(ANALYSIS.top_orgs))+
'</div><p class="cap" style="margin-top:26px;max-width:760px">Figures are over the analysed sample of high-popularity source datasets, not the whole Hub. Click any dataset to open it in the Explorer.</p>';
ov.dataset.done='1';
}
// ---------- Explorer ----------
let network;
function dot(t){return '<span class="dot" style="background:'+(COLORS[t]||'#999')+'"></span>';}
function explore(){const ds=document.getElementById('q').value.trim();if(ds)showExplorer(ds);}
function showExplorer(ds){
document.getElementById('overview').style.display='none';
document.getElementById('wrap').style.display='flex';
document.getElementById('tabEx').classList.add('on');document.getElementById('tabOv').classList.remove('on');
if(ds){document.getElementById('q').value=ds;go();}
else{map();} // default Explorer = overall map, zoom in by clicking
}
async function map(){
const g=await fetch('/api/map').then(x=>x.json());
const nodes=g.nodes.map(n=>({id:n.id,label:(n.derivatives>=8)?n.label:undefined,
title:n.id+' · '+n.derivatives+' derivatives · '+(n.downloads||0).toLocaleString()+' downloads',
shape:'dot',size:6+Math.sqrt(n.derivatives||1)*2.4,
color:{background:'#43586b',border:'#43586b'},
font:{color:'#111',face:'Palatino, Georgia, serif',size:13,strokeWidth:4,strokeColor:'#fffff8'}}));
const edges=g.edges.map(e=>({from:e.from,to:e.to,__type:e.type,arrows:{to:{scaleFactor:.4}},
color:{color:COLORS[e.type]||'#999',opacity:.9},width:1.6,title:e.type+' · '+e.confidence}));
// release the previous network (canvas, listeners, physics timers) before replacing it
if(network)network.destroy();
network=new vis.Network(document.getElementById('net'),{nodes,edges},
{physics:{stabilization:{iterations:160,updateInterval:50},
barnesHut:{gravitationalConstant:-14000,springLength:95,centralGravity:.12,avoidOverlap:.2}},
nodes:{borderWidth:0},edges:{smooth:false},interaction:{hover:true,hideEdgesOnDrag:true,tooltipDelay:120}});
network.on('stabilizationIterationsDone',()=>network.setOptions({physics:false}));
network.on('click',p=>{if(p.nodes.length)showExplorer(p.nodes[0]);});
currentNetwork=network;applyFilter();
document.getElementById('side').innerHTML='<h2>Lineage map</h2><div class="meta">'+
g.nodes.length.toLocaleString()+' source datasets · '+g.edges.length+' multi-level chain links</div>'+
'<p class="row">Node <b>size</b> = number of derivatives. Edge <b>colour</b> = relationship type (key, lower-left). '+
'Large dots are heavily-reused datasets; connected runs are multi-step lineages. '+
'<b>Click any node</b> to open its full lineage.</p>';
}
async function go(){
const ds=document.getElementById('q').value.trim();if(!ds)return;
const [g,r]=await Promise.all([
fetch('/api/graph?dataset='+encodeURIComponent(ds)+'&depth=2').then(x=>x.json()),
fetch('/related?dataset='+encodeURIComponent(ds)).then(x=>x.json())]);
const top=new Set(g.nodes.slice().sort((a,b)=>(b.downloads||0)-(a.downloads||0)).slice(0,40).map(n=>n.id));
const big=g.nodes.length>250;
const nodes=g.nodes.map(n=>({id:n.id,__focus:n.focus,label:(n.focus||top.has(n.id)||!big)?n.label:undefined,title:n.title,shape:'dot',
size:n.focus?16:(6+Math.log10((n.downloads||0)+10)*2.2),
color:{background:n.focus?'#c1440e':'#43586b',border:n.focus?'#c1440e':'#43586b'},
font:{color:'#111',face:'Palatino, Georgia, serif',size:n.focus?16:12,strokeWidth:4,strokeColor:'#fffff8'}}));
const edges=g.edges.map(e=>({from:e.from,to:e.to,__type:e.type,arrows:{to:{scaleFactor:.45}},
color:{color:COLORS[e.type]||'#999',opacity:.8},width:e.source==='inferred'?1.2:.6,
dashes:e.source==='declared',title:e.type+' · conf '+e.confidence+' · '+e.source}));
// adaptive layout: hub-shaped (many children of focus) -> force (spreads radially);
// chain-shaped (multi-level) -> hierarchical LR (reads as a tree).
const fanout=g.edges.filter(e=>e.from===ds||e.to===ds).length;
const useForce=fanout>25||g.nodes.length>180;
const opts=useForce?{
physics:{stabilization:{iterations:200,updateInterval:50},
barnesHut:{gravitationalConstant:-12000,springLength:90,centralGravity:.05,
avoidOverlap:.35,damping:.4}},
nodes:{borderWidth:0},edges:{smooth:false,width:.6,color:{opacity:.55}},
interaction:{hover:!big,tooltipDelay:120,hideEdgesOnDrag:true}}:{
layout:{hierarchical:{enabled:true,direction:'LR',sortMethod:'directed',
levelSeparation:230,nodeSpacing:16,treeSpacing:36,blockShifting:true,edgeMinimization:true}},
physics:false,nodes:{borderWidth:0},edges:{smooth:false},
interaction:{hover:!big,tooltipDelay:120,hideEdgesOnDrag:true}};
// release the previous network (canvas, listeners, physics timers) before replacing it
if(network)network.destroy();
network=new vis.Network(document.getElementById('net'),{nodes,edges},opts);
if(useForce)network.on('stabilizationIterationsDone',()=>network.setOptions({physics:false}));
network.on('click',p=>{if(p.nodes.length){showExplorer(p.nodes[0]);}});
currentNetwork=network;applyFilter();
renderSide(r);
}
function lst(title,arr){
if(!arr||!arr.length)return '';
let h='<h3>'+title+' · '+arr.length+'</h3>';
for(const x of arr){const t=x.primary_type||'',id=esc(x.id);
h+='<div class="row">'+dot(t)+'<a data-ds="'+id+'">'+id+'</a>'+
'<a class="hub" href="https://huggingface.co/datasets/'+id+'" target="_blank" rel="noopener" title="open on HF Hub">↗</a>'+
(t?'<span class="ty">'+esc(t.replace(/_/g,' '))+(x.size_ratio?' ×'+esc(x.size_ratio):'')+'</span>':'')+'</div>';}
return h;
}
function renderSide(r){
const m=r.meta||{},id=esc(r.dataset);
let h='<h2>'+id+'</h2><div class="meta">'+(m.downloads||0).toLocaleString()+' downloads'+
(m.author?' · '+esc(m.author):'')+' · <a href="https://huggingface.co/datasets/'+id+'" target="_blank" rel="noopener">Hub ↗</a></div>';
h+=lst('Derived from — parents',r.parents);
h+=lst('Derivatives — children',r.children);
h+=lst('Siblings — share a parent',r.siblings);
if(!r.parents.length&&!r.children.length&&!r.siblings.length)
h+='<p class="placeholder">No lineage recorded for this dataset in the current sample.</p>';
h+='<h3>query API</h3><div class="row"><code>/related?dataset='+id+'</code></div>';
document.getElementById('side').innerHTML=h;
}
// any dataset link anywhere -> open in explorer
document.addEventListener('click',e=>{const a=e.target.closest('a[data-ds]');
if(a){e.preventDefault();showExplorer(a.getAttribute('data-ds'));}});
showOverview();
</script></body></html>"""
# Script entry point: when the module is executed directly, serve the app
# with uvicorn on all interfaces, port 7860.
if __name__ == "__main__":
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)