| <!DOCTYPE html> |
| <html lang="en" data-theme="light" data-toc-auto-collapse="1"> |
| <head> |
| <meta charset="utf-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <title>Maintain the unmaintainable: |
| 1M python loc, 400+ models</title> |
| <meta name="description" content="A peek into software engineering for the transformers library"> |
| <!-- NOTE(review): canonical, og:url and the JSON-LD mainEntityOfPage all point at http://localhost:4321/, which looks like a dev-server origin baked into the build. og:image and twitter:image are root-relative as well. Set the production site URL in the build config and confirm. --> |
| <link rel="canonical" href="http://localhost:4321/"> |
| <!-- Open Graph article metadata. --> |
| <meta property="og:type" content="article"> |
| <meta property="og:title" content="Maintain the unmaintainable: |
| 1M python loc, 400+ models"> |
| <meta property="og:description" content="A peek into software engineering for the transformers library"> |
| <meta property="og:url" content="http://localhost:4321/"> |
| <meta property="og:image" content="/thumb.auto.jpg"> |
| <!-- Publication date is written as ISO 8601 (YYYY-MM-DD), the format expected for article:published_time. --> |
| <meta property="article:published_time" content="2025-10-02"> |
| <meta property="article:author" content="Pablo Montalvo"> |
| <meta property="article:author" content="Lysandre Debut"> |
| <meta property="article:author" content="Pedro Cuenca"> |
| <meta property="article:author" content="Yoni Gozlan"> |
| <!-- Twitter card metadata. --> |
| <meta name="twitter:card" content="summary_large_image"> |
| <meta name="twitter:title" content="Maintain the unmaintainable: |
| 1M python loc, 400+ models"> |
| <meta name="twitter:description" content="A peek into software engineering for the transformers library"> |
| <meta name="twitter:image" content="/thumb.auto.jpg"> |
| <!-- schema.org Article structured data; datePublished uses the same ISO 8601 date as the Open Graph tag above. --> |
| <script type="application/ld+json">{"@context":"https://schema.org","@type":"Article","headline":"Maintain the unmaintainable:\n1M python loc, 400+ models","description":"A peek into software engineering for the transformers library","datePublished":"2025-10-02","author":[{"@type":"Person","name":"Pablo Montalvo"},{"@type":"Person","name":"Lysandre Debut"},{"@type":"Person","name":"Pedro Cuenca"},{"@type":"Person","name":"Yoni Gozlan"}],"keywords":"transformers, engineering, design-philosophy","mainEntityOfPage":"http://localhost:4321/","image":["/thumb.auto.jpg"]}</script> |
| <script> |
| (() => { |
| try { |
| const saved = localStorage.getItem("theme"); |
| const prefersDark = |
| window.matchMedia && |
| window.matchMedia("(prefers-color-scheme: dark)").matches; |
| const theme = saved || (prefersDark ? "dark" : "light"); |
| document.documentElement.setAttribute("data-theme", theme); |
| } catch {} |
| })(); |
| </script><script type="module" src="/scripts/color-palettes.js"></script><script src="https://cdn.plot.ly/plotly-3.0.0.min.js" charset="utf-8"></script><link rel="stylesheet" href="/_astro/index.C8LanvBP.css"><script type="module" src="/_astro/hoisted.DK-CdsVg.js"></script> |
| <script type="module" src="/_astro/page.CH0W_C1Z.js"></script> |
| </head> |
| <body> |
| <!-- Theme switch: an icon-only button named through aria-label. Both SVG icons are aria-hidden and non-focusable. type="button" makes it an explicit in-page action rather than a default submit button. --> |
| <button id="theme-toggle" type="button" aria-label="Toggle color theme" data-astro-cid-x3pjskd3> |
| <svg class="icon light" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> |
| <circle cx="12" cy="12" r="5" data-astro-cid-x3pjskd3></circle> |
| <line x1="12" y1="1" x2="12" y2="4" data-astro-cid-x3pjskd3></line> |
| <line x1="12" y1="20" x2="12" y2="23" data-astro-cid-x3pjskd3></line> |
| <line x1="1" y1="12" x2="4" y2="12" data-astro-cid-x3pjskd3></line> |
| <line x1="20" y1="12" x2="23" y2="12" data-astro-cid-x3pjskd3></line> |
| <line x1="4.22" y1="4.22" x2="6.34" y2="6.34" data-astro-cid-x3pjskd3></line> |
| <line x1="17.66" y1="17.66" x2="19.78" y2="19.78" data-astro-cid-x3pjskd3></line> |
| <line x1="4.22" y1="19.78" x2="6.34" y2="17.66" data-astro-cid-x3pjskd3></line> |
| <line x1="17.66" y1="6.34" x2="19.78" y2="4.22" data-astro-cid-x3pjskd3></line> |
| </svg> |
| <svg class="icon dark" width="20" height="20" viewBox="0 0 24 24" aria-hidden="true" focusable="false" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" data-astro-cid-x3pjskd3> |
| <path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z" data-astro-cid-x3pjskd3></path> |
| </svg> |
| </button> |
| <!-- Hero: page title followed by the embedded banner fragment (its scoped styles start right after this line). --> |
| <section class="hero" data-astro-cid-bbe6dxrz> |
| <h1 class="hero-title" data-astro-cid-bbe6dxrz>Maintain the unmaintainable:<br>1M python loc, 400+ models</h1> |
| <div class="hero-banner" data-astro-cid-bbe6dxrz> |
| <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-09532bx2xlzs"><style> |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@500;600&display=swap'); |
| |
| /* Banner frame: fully transparent by default; the .loaded modifier below fades it in over 0.5s. */ |
| .banner-container { |
|   position: relative; |
|   width: 100%; |
|   height: 400px; |
|   margin: 0 auto; |
|   padding: 40px 0; |
|   overflow: visible; |
|   opacity: 0; |
|   transition: opacity 0.5s ease-in-out; |
| } |
| |
| .banner-container.loaded { |
|   opacity: 1; |
| } |
| |
| /* The SVG fills the whole frame. */ |
| #banner-svg { |
|   display: block; |
|   width: 100%; |
|   height: 100%; |
| } |
| |
| /* Edges: faint grey by default, red and thicker for "cand" edges. */ |
| .link { |
|   stroke: #9aa4b2; |
|   stroke-opacity: 0.25; |
| } |
| |
| .link.cand { |
|   stroke: #e63946; |
|   stroke-width: 2; |
|   stroke-opacity: 0.5; |
| } |
| |
| /* Node fill per class, taken from the categorical palette variables with literal fallbacks. */ |
| .node.base circle { |
|   fill: var(--palette-categorical-1, #ffbe0b); |
| } |
| |
| .node.derived circle { |
|   fill: var(--palette-categorical-2, #1f77b4); |
| } |
| |
| .node.cand circle { |
|   fill: var(--palette-categorical-3, #e63946); |
| } |
| |
| /* Node labels: centered text with a wide page-colored stroke painted under the fill, acting as a halo. */ |
| .node-label { |
|   font-family: "Inter", system-ui, Arial, sans-serif; |
|   font-size: 30px; |
|   font-weight: 600; |
|   text-anchor: middle; |
|   fill: var(--text-color, #1f2937); |
|   stroke: var(--page-bg, #ffffff); |
|   stroke-width: 30px; |
|   paint-order: stroke fill; |
|   pointer-events: none; |
| } |
| |
| /* Dark theme overrides (hard-coded colors rather than theme variables). */ |
| [data-theme="dark"] .node-label { |
|   fill: #e5e7eb; |
|   stroke: #1a202c; |
|   stroke-width: 30px; |
| } |
| |
| /* NOTE(review): same specificity as .link.cand and later in source order, so in dark mode this also replaces the red stroke of "cand" edges. Confirm that is intended. */ |
| [data-theme="dark"] .link { |
|   stroke: #718096; |
| } |
| |
| /* Oversized label variant; !important beats both the base and dark-theme .node-label rules. */ |
| /* NOTE(review): weight 700 is not among the Inter weights requested by the @import above (500, 600). */ |
| .llama-label { |
|   font-size: 90px !important; |
|   font-weight: 700 !important; |
|   stroke-width: 70px !important; |
| } |
| |
| </style> |
|
|
| <!-- Hero banner canvas. The SVG is empty in the markup and is presumably filled by the inline script that follows, using its `graph` nodes and links. role="img" plus aria-label give the drawing one accessible name instead of exposing every generated node label to assistive technology. --> |
| <div class="banner-container"> |
| <svg id="banner-svg" role="img" aria-label="Graph of transformers models connected by the code they import from one another"></svg> |
| </div> |
|
|
| <script src="https://d3js.org/d3.v7.min.js"></script> |
| <script src="/scripts/color-palettes.js"></script> |
| <script> |
| |
| const graph = {"nodes":[{"id":"aimv2","cls":"derived","sz":1.0411522633744856},{"id":"albert","cls":"cand","sz":1.9176954732510287},{"id":"align","cls":"cand","sz":1.9176954732510287},{"id":"altclip","cls":"cand","sz":1.9135802469135803},{"id":"apertus","cls":"derived","sz":1.045267489711934},{"id":"arcee","cls":"derived","sz":1.0246913580246915},{"id":"aria","cls":"derived","sz":1.045267489711934},{"id":"audio_spectrogram_transformer","cls":"cand","sz":1.9176954732510287},{"id":"auto","cls":"base","sz":1.0123456790123457},{"id":"autoformer","cls":"cand","sz":1.9094650205761317},{"id":"aya_vision","cls":"derived","sz":1.0411522633744856},{"id":"bamba","cls":"derived","sz":1.074074074074074},{"id":"bark","cls":"cand","sz":1.9176954732510287},{"id":"bart","cls":"base","sz":1.9588477366255144},{"id":"beit","cls":"base","sz":1.9176954732510287},{"id":"bert","cls":"base","sz":2.02880658436214},{"id":"bert_generation","cls":"cand","sz":1.9176954732510287},{"id":"big_bird","cls":"cand","sz":1.9094650205761317},{"id":"bigbird_pegasus","cls":"base","sz":1.905349794238683},{"id":"biogpt","cls":"derived","sz":1.0164609053497942},{"id":"bit","cls":"cand","sz":1.9176954732510287},{"id":"bitnet","cls":"derived","sz":1.0329218106995885},{"id":"blenderbot","cls":"cand","sz":1.9176954732510287},{"id":"blenderbot_small","cls":"cand","sz":1.9176954732510287},{"id":"blip","cls":"cand","sz":1.9176954732510287},{"id":"blip_2","cls":"base","sz":1.9176954732510287},{"id":"bloom","cls":"cand","sz":1.9176954732510287},{"id":"blt","cls":"derived","sz":1.0411522633744856},{"id":"bridgetower","cls":"cand","sz":1.9094650205761317},{"id":"bros","cls":"cand","sz":1.9176954732510287},{"id":"camembert","cls":"derived","sz":1.0329218106995885},{"id":"canine","cls":"cand","sz":1.9135802469135803},{"id":"chameleon","cls":"base","sz":1.9423868312757202},{"id":"chinese_clip","cls":"cand","sz":1.9176954732510287},{"id":"clap","cls":"cand","sz":1.9094650205761317},{"id":"clip","cls":"base","sz":2.0164609
05349794},{"id":"clipseg","cls":"cand","sz":1.9176954732510287},{"id":"clvp","cls":"cand","sz":1.9135802469135803},{"id":"codegen","cls":"cand","sz":1.9176954732510287},{"id":"cohere","cls":"derived","sz":1.0658436213991769},{"id":"cohere2","cls":"derived","sz":1.045267489711934},{"id":"cohere2_vision","cls":"derived","sz":1.0164609053497942},{"id":"colpali","cls":"base","sz":1.008230452674897},{"id":"colqwen2","cls":"derived","sz":1.008230452674897},{"id":"convbert","cls":"cand","sz":1.9176954732510287},{"id":"convnext","cls":"cand","sz":1.9176954732510287},{"id":"convnextv2","cls":"cand","sz":1.9176954732510287},{"id":"cpmant","cls":"cand","sz":1.9176954732510287},{"id":"csm","cls":"derived","sz":1.0329218106995885},{"id":"ctrl","cls":"cand","sz":1.9176954732510287},{"id":"cvt","cls":"cand","sz":1.9176954732510287},{"id":"d_fine","cls":"derived","sz":1.0534979423868314},{"id":"dab_detr","cls":"cand","sz":1.9135802469135803},{"id":"dac","cls":"cand","sz":1.9135802469135803},{"id":"data2vec","cls":"derived","sz":1.074074074074074},{"id":"dbrx","cls":"cand","sz":1.9176954732510287},{"id":"deberta","cls":"cand","sz":1.9176954732510287},{"id":"deberta_v2","cls":"cand","sz":1.9176954732510287},{"id":"decision_transformer","cls":"cand","sz":1.9176954732510287},{"id":"deepseek_v2","cls":"derived","sz":1.037037037037037},{"id":"deepseek_v3","cls":"derived","sz":1.131687242798354},{"id":"deepseek_vl","cls":"derived","sz":1.0329218106995885},{"id":"deepseek_vl_hybrid","cls":"derived","sz":1.02880658436214},{"id":"deit","cls":"cand","sz":1.9176954732510287},{"id":"deprecated","cls":"cand","sz":1.8888888888888888},{"id":"depth_anything","cls":"base","sz":1.9423868312757202},{"id":"depth_pro","cls":"cand","sz":1.9135802469135803},{"id":"detr","cls":"cand","sz":1.9135802469135803},{"id":"dia","cls":"derived","sz":1.0205761316872428},{"id":"diffllama","cls":"derived","sz":1.0411522633744856},{"id":"dinat","cls":"cand","sz":1.9176954732510287},{"id":"dinov2","cls":"base","sz":1.97
119341563786},{"id":"dinov2_with_registers","cls":"derived","sz":1.0246913580246915},{"id":"dinov3_convnext","cls":"cand","sz":1.9135802469135803},{"id":"dinov3_vit","cls":"derived","sz":1.0329218106995885},{"id":"distilbert","cls":"cand","sz":1.9176954732510287},{"id":"doge","cls":"derived","sz":1.0411522633744856},{"id":"donut","cls":"cand","sz":1.9135802469135803},{"id":"dots1","cls":"derived","sz":1.045267489711934},{"id":"dpr","cls":"cand","sz":1.9094650205761317},{"id":"efficientloftr","cls":"cand","sz":1.9135802469135803},{"id":"efficientnet","cls":"cand","sz":1.9176954732510287},{"id":"electra","cls":"cand","sz":1.9176954732510287},{"id":"emu3","cls":"derived","sz":1.0329218106995885},{"id":"encodec","cls":"cand","sz":1.9135802469135803},{"id":"encoder_decoder","cls":"cand","sz":1.9135802469135803},{"id":"eomt","cls":"derived","sz":1.02880658436214},{"id":"ernie","cls":"derived","sz":1.0699588477366255},{"id":"ernie4_5","cls":"derived","sz":1.0329218106995885},{"id":"ernie4_5_moe","cls":"derived","sz":1.037037037037037},{"id":"esm","cls":"base","sz":1.8806584362139918},{"id":"evolla","cls":"derived","sz":1.0617283950617284},{"id":"exaone4","cls":"derived","sz":1.0493827160493827},{"id":"falcon","cls":"cand","sz":1.9135802469135803},{"id":"falcon_h1","cls":"derived","sz":1.0493827160493827},{"id":"falcon_mamba","cls":"derived","sz":1.0411522633744856},{"id":"fastspeech2_conformer","cls":"cand","sz":1.9094650205761317},{"id":"flaubert","cls":"cand","sz":1.9176954732510287},{"id":"flava","cls":"cand","sz":1.9094650205761317},{"id":"flex_olmo","cls":"derived","sz":1.037037037037037},{"id":"florence2","cls":"derived","sz":1.02880658436214},{"id":"fnet","cls":"cand","sz":1.9176954732510287},{"id":"focalnet","cls":"cand","sz":1.9135802469135803},{"id":"fsmt","cls":"cand","sz":1.9176954732510287},{"id":"funnel","cls":"cand","sz":1.9176954732510287},{"id":"fuyu","cls":"cand","sz":1.9176954732510287},{"id":"gemma","cls":"derived","sz":1.1152263374485596},{"id":"gemma2
","cls":"derived","sz":1.1604938271604939},{"id":"gemma3","cls":"derived","sz":1.0864197530864197},{"id":"gemma3n","cls":"derived","sz":1.0617283950617284},{"id":"git","cls":"cand","sz":1.9176954732510287},{"id":"glm","cls":"derived","sz":1.0534979423868314},{"id":"glm4","cls":"derived","sz":1.037037037037037},{"id":"glm4_moe","cls":"derived","sz":1.0658436213991769},{"id":"glm4v","cls":"derived","sz":1.0699588477366255},{"id":"glm4v_moe","cls":"derived","sz":1.0411522633744856},{"id":"glpn","cls":"cand","sz":1.9176954732510287},{"id":"got_ocr2","cls":"derived","sz":1.045267489711934},{"id":"gpt2","cls":"cand","sz":1.9135802469135803},{"id":"gpt_bigcode","cls":"cand","sz":1.9176954732510287},{"id":"gpt_neo","cls":"cand","sz":1.9176954732510287},{"id":"gpt_neox","cls":"derived","sz":1.0205761316872428},{"id":"gpt_neox_japanese","cls":"cand","sz":1.9176954732510287},{"id":"gpt_oss","cls":"derived","sz":1.0411522633744856},{"id":"gptj","cls":"cand","sz":1.9135802469135803},{"id":"granite","cls":"derived","sz":1.0246913580246915},{"id":"granite_speech","cls":"cand","sz":1.9176954732510287},{"id":"granitemoe","cls":"base","sz":1.934156378600823},{"id":"granitemoehybrid","cls":"derived","sz":1.0411522633744856},{"id":"granitemoeshared","cls":"derived","sz":1.045267489711934},{"id":"grounding_dino","cls":"base","sz":1.0411522633744856},{"id":"groupvit","cls":"cand","sz":1.9135802469135803},{"id":"helium","cls":"derived","sz":1.037037037037037},{"id":"hgnet_v2","cls":"derived","sz":1.0041152263374487},{"id":"hiera","cls":"cand","sz":1.9135802469135803},{"id":"hubert","cls":"derived","sz":1.02880658436214},{"id":"hunyuan_v1_dense","cls":"derived","sz":1.0411522633744856},{"id":"hunyuan_v1_moe","cls":"derived","sz":1.0411522633744856},{"id":"ibert","cls":"cand","sz":1.9176954732510287},{"id":"idefics","cls":"base","sz":1.9423868312757202},{"id":"idefics2","cls":"cand","sz":1.9176954732510287},{"id":"idefics3","cls":"base","sz":1.9382716049382716},{"id":"ijepa","cls":"derived"
,"sz":1.0164609053497942},{"id":"imagegpt","cls":"cand","sz":1.9176954732510287},{"id":"informer","cls":"derived","sz":1.0534979423868314},{"id":"instructblip","cls":"base","sz":1.9423868312757202},{"id":"instructblipvideo","cls":"derived","sz":1.0329218106995885},{"id":"internvl","cls":"derived","sz":1.0329218106995885},{"id":"jamba","cls":"base","sz":1.9218106995884774},{"id":"janus","cls":"derived","sz":1.0658436213991769},{"id":"jetmoe","cls":"cand","sz":1.9094650205761317},{"id":"kosmos2","cls":"cand","sz":1.9176954732510287},{"id":"kosmos2_5","cls":"cand","sz":1.9176954732510287},{"id":"kyutai_speech_to_text","cls":"derived","sz":1.0164609053497942},{"id":"layoutlm","cls":"cand","sz":1.9176954732510287},{"id":"layoutlmv2","cls":"cand","sz":1.9135802469135803},{"id":"layoutlmv3","cls":"cand","sz":1.9135802469135803},{"id":"led","cls":"cand","sz":1.9094650205761317},{"id":"levit","cls":"cand","sz":1.9176954732510287},{"id":"lfm2","cls":"derived","sz":1.037037037037037},{"id":"lfm2_vl","cls":"derived","sz":1.0205761316872428},{"id":"lightglue","cls":"derived","sz":1.0205761316872428},{"id":"lilt","cls":"cand","sz":1.9176954732510287},{"id":"llama","cls":"base","sz":3.0},{"id":"llama4","cls":"base","sz":1.9218106995884774},{"id":"llava","cls":"base","sz":2.1069958847736627},{"id":"llava_next","cls":"base","sz":1.954732510288066},{"id":"llava_next_video","cls":"derived","sz":1.0658436213991769},{"id":"llava_onevision","cls":"derived","sz":1.037037037037037},{"id":"longcat_flash","cls":"derived","sz":1.045267489711934},{"id":"longformer","cls":"cand","sz":1.9094650205761317},{"id":"longt5","cls":"cand","sz":1.9094650205761317},{"id":"luke","cls":"cand","sz":1.9176954732510287},{"id":"lxmert","cls":"cand","sz":1.9176954732510287},{"id":"m2m_100","cls":"cand","sz":1.9176954732510287},{"id":"mamba","cls":"base","sz":1.9588477366255144},{"id":"mamba2","cls":"base","sz":1.9629629629629628},{"id":"marian","cls":"cand","sz":1.9176954732510287},{"id":"markuplm","cls":"cand"
,"sz":1.9176954732510287},{"id":"mask2former","cls":"base","sz":1.008230452674897},{"id":"maskformer","cls":"base","sz":1.9135802469135803},{"id":"mbart","cls":"base","sz":1.9218106995884774},{"id":"megatron_bert","cls":"cand","sz":1.9176954732510287},{"id":"metaclip_2","cls":"derived","sz":1.045267489711934},{"id":"mgp_str","cls":"cand","sz":1.9176954732510287},{"id":"mimi","cls":"base","sz":1.9176954732510287},{"id":"minimax","cls":"derived","sz":1.0411522633744856},{"id":"ministral","cls":"derived","sz":1.045267489711934},{"id":"mistral","cls":"derived","sz":1.1563786008230452},{"id":"mistral3","cls":"derived","sz":1.02880658436214},{"id":"mixtral","cls":"derived","sz":1.1358024691358024},{"id":"mlcd","cls":"derived","sz":1.0411522633744856},{"id":"mllama","cls":"base","sz":1.9465020576131686},{"id":"mm_grounding_dino","cls":"derived","sz":1.045267489711934},{"id":"mobilebert","cls":"cand","sz":1.9176954732510287},{"id":"mobilenet_v1","cls":"cand","sz":1.9135802469135803},{"id":"mobilenet_v2","cls":"cand","sz":1.9135802469135803},{"id":"mobilevit","cls":"cand","sz":1.9135802469135803},{"id":"mobilevitv2","cls":"cand","sz":1.9135802469135803},{"id":"modernbert","cls":"derived","sz":1.0329218106995885},{"id":"modernbert_decoder","cls":"derived","sz":1.0246913580246915},{"id":"moonshine","cls":"derived","sz":1.0329218106995885},{"id":"moshi","cls":"base","sz":1.9176954732510287},{"id":"mpnet","cls":"cand","sz":1.9176954732510287},{"id":"mpt","cls":"cand","sz":1.9176954732510287},{"id":"mra","cls":"cand","sz":1.9135802469135803},{"id":"mt5","cls":"cand","sz":1.9135802469135803},{"id":"musicgen","cls":"cand","sz":1.9094650205761317},{"id":"musicgen_melody","cls":"cand","sz":1.9135802469135803},{"id":"mvp","cls":"cand","sz":1.9176954732510287},{"id":"nemotron","cls":"base","sz":1.925925925925926},{"id":"nllb_moe","cls":"cand","sz":1.9094650205761317},{"id":"nystromformer","cls":"cand","sz":1.9176954732510287},{"id":"olmo","cls":"derived","sz":1.0576131687242798},{"id":
"olmo2","cls":"derived","sz":1.0946502057613168},{"id":"olmo3","cls":"derived","sz":1.037037037037037},{"id":"olmoe","cls":"base","sz":1.9300411522633745},{"id":"omdet_turbo","cls":"cand","sz":1.9094650205761317},{"id":"oneformer","cls":"cand","sz":1.905349794238683},{"id":"openai","cls":"cand","sz":1.9176954732510287},{"id":"opt","cls":"base","sz":1.9218106995884774},{"id":"ovis2","cls":"derived","sz":1.0411522633744856},{"id":"owlvit","cls":"cand","sz":1.9135802469135803},{"id":"paligemma","cls":"base","sz":1.954732510288066},{"id":"patchtsmixer","cls":"cand","sz":1.9135802469135803},{"id":"patchtst","cls":"cand","sz":1.9135802469135803},{"id":"pegasus","cls":"cand","sz":1.9176954732510287},{"id":"pegasus_x","cls":"cand","sz":1.9135802469135803},{"id":"perceiver","cls":"cand","sz":1.9094650205761317},{"id":"perception_lm","cls":"derived","sz":1.0205761316872428},{"id":"persimmon","cls":"cand","sz":1.9176954732510287},{"id":"phi","cls":"derived","sz":1.037037037037037},{"id":"phi3","cls":"derived","sz":1.0658436213991769},{"id":"phi4_multimodal","cls":"derived","sz":1.0617283950617284},{"id":"phimoe","cls":"cand","sz":1.9094650205761317},{"id":"pix2struct","cls":"cand","sz":1.9135802469135803},{"id":"pixtral","cls":"base","sz":1.925925925925926},{"id":"plbart","cls":"derived","sz":1.02880658436214},{"id":"poolformer","cls":"cand","sz":1.9176954732510287},{"id":"pop2piano","cls":"cand","sz":1.9135802469135803},{"id":"prompt_depth_anything","cls":"derived","sz":1.0246913580246915},{"id":"prophetnet","cls":"cand","sz":1.9135802469135803},{"id":"pvt","cls":"cand","sz":1.9176954732510287},{"id":"pvt_v2","cls":"cand","sz":1.9176954732510287},{"id":"qwen2","cls":"derived","sz":1.139917695473251},{"id":"qwen2_5_omni","cls":"derived","sz":1.0658436213991769},{"id":"qwen2_5_vl","cls":"derived","sz":1.139917695473251},{"id":"qwen2_audio","cls":"base","sz":1.9382716049382716},{"id":"qwen2_moe","cls":"base","sz":1.9218106995884774},{"id":"qwen2_vl","cls":"base","sz":1.991769547
325103},{"id":"qwen3","cls":"derived","sz":1.1275720164609053},{"id":"qwen3_moe","cls":"derived","sz":1.1069958847736625},{"id":"qwen3_next","cls":"derived","sz":1.0534979423868314},{"id":"qwen3_omni_moe","cls":"derived","sz":1.1069958847736625},{"id":"qwen3_vl","cls":"derived","sz":1.0823045267489713},{"id":"qwen3_vl_moe","cls":"derived","sz":1.0493827160493827},{"id":"rag","cls":"cand","sz":1.9176954732510287},{"id":"recurrent_gemma","cls":"cand","sz":1.9135802469135803},{"id":"reformer","cls":"cand","sz":1.9094650205761317},{"id":"regnet","cls":"cand","sz":1.9135802469135803},{"id":"rembert","cls":"cand","sz":1.9176954732510287},{"id":"resnet","cls":"cand","sz":1.9135802469135803},{"id":"roberta","cls":"derived","sz":1.1275720164609053},{"id":"roberta_prelayernorm","cls":"cand","sz":1.9176954732510287},{"id":"roc_bert","cls":"cand","sz":1.9135802469135803},{"id":"roformer","cls":"cand","sz":1.9176954732510287},{"id":"rt_detr","cls":"base","sz":1.0781893004115226},{"id":"rt_detr_v2","cls":"derived","sz":1.02880658436214},{"id":"rwkv","cls":"cand","sz":1.9176954732510287},{"id":"sam","cls":"base","sz":2.016460905349794},{"id":"sam2","cls":"derived","sz":1.074074074074074},{"id":"sam2_video","cls":"derived","sz":1.02880658436214},{"id":"sam_hq","cls":"derived","sz":1.045267489711934},{"id":"seamless_m4t","cls":"cand","sz":1.9094650205761317},{"id":"seamless_m4t_v2","cls":"cand","sz":1.9094650205761317},{"id":"seed_oss","cls":"derived","sz":1.0411522633744856},{"id":"seggpt","cls":"cand","sz":1.9135802469135803},{"id":"sew","cls":"derived","sz":1.045267489711934},{"id":"sew_d","cls":"cand","sz":1.9135802469135803},{"id":"shieldgemma2","cls":"cand","sz":1.5308641975308643},{"id":"siglip","cls":"base","sz":2.0411522633744856},{"id":"siglip2","cls":"derived","sz":1.0534979423868314},{"id":"smollm3","cls":"derived","sz":1.0411522633744856},{"id":"smolvlm","cls":"derived","sz":1.0205761316872428},{"id":"speech_encoder_decoder","cls":"cand","sz":1.9135802469135803},{"id":"
speech_to_text","cls":"cand","sz":1.9176954732510287},{"id":"speecht5","cls":"cand","sz":1.9094650205761317},{"id":"splinter","cls":"cand","sz":1.9176954732510287},{"id":"squeezebert","cls":"cand","sz":1.9176954732510287},{"id":"stablelm","cls":"cand","sz":1.9176954732510287},{"id":"starcoder2","cls":"derived","sz":1.037037037037037},{"id":"superglue","cls":"cand","sz":1.9176954732510287},{"id":"superpoint","cls":"cand","sz":1.9176954732510287},{"id":"swiftformer","cls":"cand","sz":1.9176954732510287},{"id":"swin","cls":"cand","sz":1.9135802469135803},{"id":"swin2sr","cls":"cand","sz":1.9135802469135803},{"id":"swinv2","cls":"cand","sz":1.9135802469135803},{"id":"switch_transformers","cls":"cand","sz":1.9094650205761317},{"id":"t5","cls":"cand","sz":1.9135802469135803},{"id":"t5gemma","cls":"derived","sz":1.0329218106995885},{"id":"table_transformer","cls":"cand","sz":1.9135802469135803},{"id":"tapas","cls":"cand","sz":1.9094650205761317},{"id":"textnet","cls":"cand","sz":1.905349794238683},{"id":"time_series_transformer","cls":"base","sz":1.9629629629629628},{"id":"timesfm","cls":"derived","sz":1.008230452674897},{"id":"timesformer","cls":"cand","sz":1.9176954732510287},{"id":"timm_backbone","cls":"cand","sz":1.7613168724279835},{"id":"timm_wrapper","cls":"cand","sz":1.9135802469135803},{"id":"trocr","cls":"cand","sz":1.9176954732510287},{"id":"tvp","cls":"cand","sz":1.9176954732510287},{"id":"udop","cls":"cand","sz":1.9094650205761317},{"id":"umt5","cls":"cand","sz":1.9135802469135803},{"id":"unispeech","cls":"derived","sz":1.037037037037037},{"id":"unispeech_sat","cls":"derived","sz":1.045267489711934},{"id":"univnet","cls":"cand","sz":1.9176954732510287},{"id":"upernet","cls":"cand","sz":1.9135802469135803},{"id":"vaultgemma","cls":"derived","sz":1.0205761316872428},{"id":"video_llava","cls":"cand","sz":1.9176954732510287},{"id":"videomae","cls":"cand","sz":1.9135802469135803},{"id":"vilt","cls":"cand","sz":1.9176954732510287},{"id":"vipllava","cls":"derived","s
z":1.0205761316872428},{"id":"vision_encoder_decoder","cls":"cand","sz":1.9135802469135803},{"id":"vision_text_dual_encoder","cls":"cand","sz":1.905349794238683},{"id":"visual_bert","cls":"cand","sz":1.9176954732510287},{"id":"vit","cls":"base","sz":1.9300411522633745},{"id":"vit_mae","cls":"cand","sz":1.9135802469135803},{"id":"vit_msn","cls":"cand","sz":1.9176954732510287},{"id":"vitdet","cls":"base","sz":1.9218106995884774},{"id":"vitmatte","cls":"cand","sz":1.9094650205761317},{"id":"vitpose","cls":"cand","sz":1.9094650205761317},{"id":"vitpose_backbone","cls":"cand","sz":1.9176954732510287},{"id":"vits","cls":"cand","sz":1.9094650205761317},{"id":"vivit","cls":"cand","sz":1.9176954732510287},{"id":"vjepa2","cls":"cand","sz":1.9135802469135803},{"id":"voxtral","cls":"derived","sz":1.0164609053497942},{"id":"wav2vec2","cls":"base","sz":2.213991769547325},{"id":"wav2vec2_bert","cls":"derived","sz":1.037037037037037},{"id":"wav2vec2_conformer","cls":"derived","sz":1.0781893004115226},{"id":"wavlm","cls":"derived","sz":1.037037037037037},{"id":"whisper","cls":"base","sz":1.925925925925926},{"id":"x_clip","cls":"cand","sz":1.9176954732510287},{"id":"xcodec","cls":"cand","sz":1.9135802469135803},{"id":"xglm","cls":"cand","sz":1.9176954732510287},{"id":"xlm","cls":"cand","sz":1.9176954732510287},{"id":"xlm_roberta","cls":"derived","sz":1.0329218106995885},{"id":"xlm_roberta_xl","cls":"derived","sz":1.0329218106995885},{"id":"xlnet","cls":"cand","sz":1.9135802469135803},{"id":"xlstm","cls":"cand","sz":1.9094650205761317},{"id":"xmod","cls":"cand","sz":1.9135802469135803},{"id":"yoso","cls":"cand","sz":1.9176954732510287},{"id":"zamba","cls":"base","sz":1.954732510288066},{"id":"zamba2","cls":"derived","sz":1.0617283950617284},{"id":"zoedepth","cls":"cand","sz":1.9135802469135803}],"links":[{"source":"llama","target":"zamba2","label":"2 imports","cand":false},{"source":"llama","target":"zamba2","label":"2 
imports","cand":false},{"source":"mamba2","target":"zamba2","label":"3 imports","cand":false},{"source":"mamba2","target":"zamba2","label":"3 imports","cand":false},{"source":"mamba2","target":"zamba2","label":"3 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"zamba","target":"zamba2","label":"10 imports","cand":false},{"source":"bert","target":"xlm_roberta_xl","label":"5 imports","cand":false},{"source":"bert","target":"xlm_roberta_xl","label":"5 imports","cand":false},{"source":"bert","target":"xlm_roberta_xl","label":"5 imports","cand":false},{"source":"bert","target":"xlm_roberta_xl","label":"5 imports","cand":false},{"source":"bert","target":"xlm_roberta_xl","label":"5 imports","cand":false},{"source":"roberta","target":"xlm_roberta_xl","label":"3 imports","cand":false},{"source":"roberta","target":"xlm_roberta_xl","label":"3 imports","cand":false},{"source":"roberta","target":"xlm_roberta_xl","label":"3 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 
imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"roberta","target":"xlm_roberta","label":"8 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wavlm","label":"9 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_conformer","label":"13 
imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_bert","label":"3 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_bert","label":"3 imports","cand":false},{"source":"wav2vec2","target":"wav2vec2_bert","label":"3 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"wav2vec2_conformer","target":"wav2vec2_bert","label":"6 imports","cand":false},{"source":"qwen2_audio","target":"voxtral","label":"4 imports","cand":false},{"source":"qwen2_audio","target":"voxtral","label":"4 imports","cand":false},{"source":"qwen2_audio","target":"voxtral","label":"4 imports","cand":false},{"source":"qwen2_audio","target":"voxtral","label":"4 imports","cand":false},{"source":"llava","target":"vipllava","label":"5 imports","cand":false},{"source":"llava","target":"vipllava","label":"5 imports","cand":false},{"source":"llava","target":"vipllava","label":"5 imports","cand":false},{"source":"llava","target":"vipllava","label":"5 imports","cand":false},{"source":"llava","target":"vipllava","label":"5 imports","cand":false},{"source":"gemma2","target":"vaultgemma","label":"5 imports","cand":false},{"source":"gemma2","target":"vaultgemma","label":"5 imports","cand":false},{"source":"gemma2","target":"vaultgemma","label":"5 imports","cand":false},{"source":"gemma2","target":"vaultgemma","label":"5 imports","cand":false},{"source":"gemma2","target":"vaultgemma","label":"5 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 
imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech_sat","label":"11 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"wav2vec2","target":"unispeech","label":"9 imports","cand":false},{"source":"llama","target":"timesfm","label":"1 imports","cand":false},{"source":"phi4_multimodal","target":"timesfm","label":"1 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 
imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"gemma2","target":"t5gemma","label":"8 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"mistral","target":"starcoder2","label":"9 imports","cand":false},{"source":"idefics3","target":"smolvlm","label":"5 imports","cand":false},{"source":"idefics3","target":"smolvlm","label":"5 imports","cand":false},{"source":"idefics3","target":"smolvlm","label":"5 imports","cand":false},{"source":"idefics3","target":"smolvlm","label":"5 imports","cand":false},{"source":"idefics3","target":"smolvlm","label":"5 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"llama","target":"smollm3","label":"9 imports","cand":false},{"source":"qwen2","target":"smollm3","label":"1 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 
imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"siglip","target":"siglip2","label":"13 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"wav2vec2","target":"sew","label":"11 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 
imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"llama","target":"seed_oss","label":"10 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam","target":"sam_hq","label":"11 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"sam2","target":"sam2_video","label":"7 imports","cand":false},{"source":"maskformer","target":"sam2","label":"1 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 
imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"sam","target":"sam2","label":"8 imports","cand":false},{"source":"vitdet","target":"sam2","label":"2 imports","cand":false},{"source":"vitdet","target":"sam2","label":"2 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"rt_detr","target":"rt_detr_v2","label":"6 imports","cand":false},{"source":"bert","target":"roberta","label":"5 imports","cand":false},{"source":"bert","target":"roberta","label":"5 imports","cand":false},{"source":"bert","target":"roberta","label":"5 imports","cand":false},{"source":"bert","target":"roberta","label":"5 imports","cand":false},{"source":"bert","target":"roberta","label":"5 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_vl_moe","label":"3 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_vl_moe","label":"3 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_vl_moe","label":"3 imports","cand":false},{"source":"qwen3_vl","target":"qwen3_vl_moe","label":"5 imports","cand":false},{"source":"qwen3_vl","target":"qwen3_vl_moe","label":"5 imports","cand":false},{"source":"qwen3_vl","target":"qwen3_vl_moe","label":"5 imports","cand":false},{"source":"qwen3_vl","target":"qwen3_vl_moe","label":"5 imports","cand":false},{"source":"qwen3_vl","target":"qwen3_vl_moe","label":"5 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen3_vl","label":"4 
imports","cand":false},{"source":"qwen2_5_vl","target":"qwen3_vl","label":"4 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen3_vl","label":"4 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen3_vl","label":"4 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen2_vl","target":"qwen3_vl","label":"6 imports","cand":false},{"source":"qwen3","target":"qwen3_vl","label":"5 imports","cand":false},{"source":"qwen3","target":"qwen3_vl","label":"5 imports","cand":false},{"source":"qwen3","target":"qwen3_vl","label":"5 imports","cand":false},{"source":"qwen3","target":"qwen3_vl","label":"5 imports","cand":false},{"source":"qwen3","target":"qwen3_vl","label":"5 imports","cand":false},{"source":"mimi","target":"qwen3_omni_moe","label":"1 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_5_omni","target":"qwen3_omni_moe","label":"6 imports","cand":false},{"source":"qwen2_moe","target":"qwen3_omni_moe","label":"1 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 
imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_omni_moe","label":"7 imports","cand":false},{"source":"qwen3_vl_moe","target":"qwen3_omni_moe","label":"4 imports","cand":false},{"source":"qwen3_vl_moe","target":"qwen3_omni_moe","label":"4 imports","cand":false},{"source":"qwen3_vl_moe","target":"qwen3_omni_moe","label":"4 imports","cand":false},{"source":"qwen3_vl_moe","target":"qwen3_omni_moe","label":"4 imports","cand":false},{"source":"bamba","target":"qwen3_next","label":"2 imports","cand":false},{"source":"bamba","target":"qwen3_next","label":"2 imports","cand":false},{"source":"gemma3","target":"qwen3_next","label":"1 imports","cand":false},{"source":"llama","target":"qwen3_next","label":"3 imports","cand":false},{"source":"llama","target":"qwen3_next","label":"3 imports","cand":false},{"source":"llama","target":"qwen3_next","label":"3 imports","cand":false},{"source":"mixtral","target":"qwen3_next","label":"1 imports","cand":false},{"source":"qwen2_moe","target":"qwen3_next","label":"1 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_next","label":"5 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_next","label":"5 
imports","cand":false},{"source":"qwen3_moe","target":"qwen3_next","label":"5 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_next","label":"5 imports","cand":false},{"source":"qwen3_moe","target":"qwen3_next","label":"5 imports","cand":false},{"source":"llama","target":"qwen3_moe","label":"4 imports","cand":false},{"source":"llama","target":"qwen3_moe","label":"4 imports","cand":false},{"source":"llama","target":"qwen3_moe","label":"4 imports","cand":false},{"source":"llama","target":"qwen3_moe","label":"4 imports","cand":false},{"source":"mixtral","target":"qwen3_moe","label":"3 imports","cand":false},{"source":"mixtral","target":"qwen3_moe","label":"3 imports","cand":false},{"source":"mixtral","target":"qwen3_moe","label":"3 imports","cand":false},{"source":"qwen2_moe","target":"qwen3_moe","label":"1 imports","cand":false},{"source":"qwen3","target":"qwen3_moe","label":"1 imports","cand":false},{"source":"gemma","target":"qwen3","label":"1 imports","cand":false},{"source":"llama","target":"qwen3","label":"1 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2","target":"qwen3","label":"10 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 
imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_vl","label":"11 imports","cand":false},{"source":"llama","target":"qwen2_5_omni","label":"1 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_5_vl","target":"qwen2_5_omni","label":"7 imports","cand":false},{"source":"qwen2_audio","target":"qwen2_5_omni","label":"1 imports","cand":false},{"source":"qwen2_vl","target":"qwen2_5_omni","label":"1 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 
imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"llama","target":"qwen2","label":"10 imports","cand":false},{"source":"mistral","target":"qwen2","label":"1 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"depth_anything","target":"prompt_depth_anything","label":"6 imports","cand":false},{"source":"bart","target":"plbart","label":"5 imports","cand":false},{"source":"bart","target":"plbart","label":"5 imports","cand":false},{"source":"bart","target":"plbart","label":"5 imports","cand":false},{"source":"bart","target":"plbart","label":"5 imports","cand":false},{"source":"bart","target":"plbart","label":"5 imports","cand":false},{"source":"bigbird_pegasus","target":"plbart","label":"1 imports","cand":false},{"source":"mbart","target":"plbart","label":"1 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"phi3","target":"phi4_multimodal","label":"6 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 
imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"siglip","target":"phi4_multimodal","label":"8 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"mistral","target":"phi3","label":"7 imports","cand":false},{"source":"clip","target":"phi","label":"1 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llama","target":"phi","label":"8 imports","cand":false},{"source":"llava","target":"perception_lm","label":"5 imports","cand":false},{"source":"llava","target":"perception_lm","label":"5 imports","cand":false},{"source":"llava","target":"perception_lm","label":"5 imports","cand":false},{"source":"llava","target":"perception_lm","label":"5 imports","cand":false},{"source":"llava","target":"perception_lm","label":"5 imports","cand":false},{"source":"aimv2","target":"ovis2","label":"2 
imports","cand":false},{"source":"aimv2","target":"ovis2","label":"2 imports","cand":false},{"source":"llama","target":"ovis2","label":"2 imports","cand":false},{"source":"llama","target":"ovis2","label":"2 imports","cand":false},{"source":"llava","target":"ovis2","label":"2 imports","cand":false},{"source":"llava","target":"ovis2","label":"2 imports","cand":false},{"source":"llava_next","target":"ovis2","label":"2 imports","cand":false},{"source":"llava_next","target":"ovis2","label":"2 imports","cand":false},{"source":"siglip","target":"ovis2","label":"2 imports","cand":false},{"source":"siglip","target":"ovis2","label":"2 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"olmo2","target":"olmo3","label":"9 imports","cand":false},{"source":"llama","target":"olmo2","label":"3 imports","cand":false},{"source":"llama","target":"olmo2","label":"3 imports","cand":false},{"source":"llama","target":"olmo2","label":"3 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"olmo","target":"olmo2","label":"6 imports","cand":false},{"source":"llama","target":"olmo","label":"8 
imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"llama","target":"olmo","label":"8 imports","cand":false},{"source":"glm","target":"moonshine","label":"3 imports","cand":false},{"source":"glm","target":"moonshine","label":"3 imports","cand":false},{"source":"glm","target":"moonshine","label":"3 imports","cand":false},{"source":"llama","target":"moonshine","label":"3 imports","cand":false},{"source":"llama","target":"moonshine","label":"3 imports","cand":false},{"source":"llama","target":"moonshine","label":"3 imports","cand":false},{"source":"whisper","target":"moonshine","label":"2 imports","cand":false},{"source":"whisper","target":"moonshine","label":"2 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"modernbert","target":"modernbert_decoder","label":"6 imports","cand":false},{"source":"gemma","target":"modernbert","label":"2 imports","cand":false},{"source":"gemma","target":"modernbert","label":"2 imports","cand":false},{"source":"auto","target":"mm_grounding_dino","label":"1 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 
imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"grounding_dino","target":"mm_grounding_dino","label":"10 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"clip","target":"mlcd","label":"7 imports","cand":false},{"source":"llama","target":"mlcd","label":"1 imports","cand":false},{"source":"qwen2_vl","target":"mlcd","label":"2 imports","cand":false},{"source":"qwen2_vl","target":"mlcd","label":"2 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 
imports","cand":false},{"source":"mistral","target":"mixtral","label":"9 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"llava","target":"mistral3","label":"6 imports","cand":false},{"source":"mistral","target":"mistral3","label":"1 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"llama","target":"mistral","label":"10 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 
imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"qwen2","target":"ministral","label":"11 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"mixtral","target":"minimax","label":"10 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"clip","target":"metaclip_2","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 
imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"deepseek_v3","target":"longcat_flash","label":"11 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next_video","target":"llava_onevision","label":"9 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 
imports","cand":false},{"source":"llava_next","target":"llava_next_video","label":"7 imports","cand":false},{"source":"auto","target":"lightglue","label":"1 imports","cand":false},{"source":"clip","target":"lightglue","label":"1 imports","cand":false},{"source":"cohere","target":"lightglue","label":"1 imports","cand":false},{"source":"llama","target":"lightglue","label":"2 imports","cand":false},{"source":"llama","target":"lightglue","label":"2 imports","cand":false},{"source":"llava","target":"lfm2_vl","label":"5 imports","cand":false},{"source":"llava","target":"lfm2_vl","label":"5 imports","cand":false},{"source":"llava","target":"lfm2_vl","label":"5 imports","cand":false},{"source":"llava","target":"lfm2_vl","label":"5 imports","cand":false},{"source":"llava","target":"lfm2_vl","label":"5 imports","cand":false},{"source":"bamba","target":"lfm2","label":"1 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"lfm2","label":"8 imports","cand":false},{"source":"llama","target":"kyutai_speech_to_text","label":"1 imports","cand":false},{"source":"mimi","target":"kyutai_speech_to_text","label":"1 imports","cand":false},{"source":"moshi","target":"kyutai_speech_to_text","label":"2 imports","cand":false},{"source":"moshi","target":"kyutai_speech_to_text","label":"2 imports","cand":false},{"source":"blip_2","target":"janus","label":"1 imports","cand":false},{"source":"chameleon","target":"janus","label":"5 imports","cand":false},{"source":"chameleon","target":"janus","label":"5 
imports","cand":false},{"source":"chameleon","target":"janus","label":"5 imports","cand":false},{"source":"chameleon","target":"janus","label":"5 imports","cand":false},{"source":"chameleon","target":"janus","label":"5 imports","cand":false},{"source":"idefics","target":"janus","label":"2 imports","cand":false},{"source":"idefics","target":"janus","label":"2 imports","cand":false},{"source":"llama","target":"janus","label":"1 imports","cand":false},{"source":"siglip","target":"janus","label":"3 imports","cand":false},{"source":"siglip","target":"janus","label":"3 imports","cand":false},{"source":"siglip","target":"janus","label":"3 imports","cand":false},{"source":"clip","target":"internvl","label":"1 imports","cand":false},{"source":"janus","target":"internvl","label":"1 imports","cand":false},{"source":"llama","target":"internvl","label":"1 imports","cand":false},{"source":"llava","target":"internvl","label":"5 imports","cand":false},{"source":"llava","target":"internvl","label":"5 imports","cand":false},{"source":"llava","target":"internvl","label":"5 imports","cand":false},{"source":"llava","target":"internvl","label":"5 imports","cand":false},{"source":"llava","target":"internvl","label":"5 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"instructblip","target":"instructblipvideo","label":"7 imports","cand":false},{"source":"auto","target":"instructblipvideo","label":"1 imports","cand":false},{"source":"bart","target":"informer","label":"1 
imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"time_series_transformer","target":"informer","label":"12 imports","cand":false},{"source":"vit","target":"ijepa","label":"4 imports","cand":false},{"source":"vit","target":"ijepa","label":"4 imports","cand":false},{"source":"vit","target":"ijepa","label":"4 imports","cand":false},{"source":"vit","target":"ijepa","label":"4 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 
imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_moe","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"llama","target":"hunyuan_v1_dense","label":"10 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"wav2vec2","target":"hubert","label":"7 imports","cand":false},{"source":"rt_detr","target":"hgnet_v2","label":"1 imports","cand":false},{"source":"gemma","target":"helium","label":"3 imports","cand":false},{"source":"gemma","target":"helium","label":"3 imports","cand":false},{"source":"gemma","target":"helium","label":"3 imports","cand":false},{"source":"granite","target":"helium","label":"1 imports","cand":false},{"source":"llama","target":"helium","label":"5 imports","cand":false},{"source":"llama","target":"helium","label":"5 
imports","cand":false},{"source":"llama","target":"helium","label":"5 imports","cand":false},{"source":"llama","target":"helium","label":"5 imports","cand":false},{"source":"llama","target":"helium","label":"5 imports","cand":false},{"source":"granitemoe","target":"granitemoeshared","label":"4 imports","cand":false},{"source":"granitemoe","target":"granitemoeshared","label":"4 imports","cand":false},{"source":"granitemoe","target":"granitemoeshared","label":"4 imports","cand":false},{"source":"granitemoe","target":"granitemoeshared","label":"4 imports","cand":false},{"source":"bamba","target":"granitemoehybrid","label":"3 imports","cand":false},{"source":"bamba","target":"granitemoehybrid","label":"3 imports","cand":false},{"source":"bamba","target":"granitemoehybrid","label":"3 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"granitemoeshared","target":"granitemoehybrid","label":"7 imports","cand":false},{"source":"llama","target":"granite","label":"5 imports","cand":false},{"source":"llama","target":"granite","label":"5 imports","cand":false},{"source":"llama","target":"granite","label":"5 imports","cand":false},{"source":"llama","target":"granite","label":"5 imports","cand":false},{"source":"llama","target":"granite","label":"5 imports","cand":false},{"source":"llama","target":"gpt_oss","label":"5 imports","cand":false},{"source":"llama","target":"gpt_oss","label":"5 imports","cand":false},{"source":"llama","target":"gpt_oss","label":"5 
imports","cand":false},{"source":"llama","target":"gpt_oss","label":"5 imports","cand":false},{"source":"llama","target":"gpt_oss","label":"5 imports","cand":false},{"source":"mixtral","target":"gpt_oss","label":"4 imports","cand":false},{"source":"mixtral","target":"gpt_oss","label":"4 imports","cand":false},{"source":"mixtral","target":"gpt_oss","label":"4 imports","cand":false},{"source":"mixtral","target":"gpt_oss","label":"4 imports","cand":false},{"source":"qwen2","target":"gpt_oss","label":"1 imports","cand":false},{"source":"llama","target":"gpt_neox","label":"4 imports","cand":false},{"source":"llama","target":"gpt_neox","label":"4 imports","cand":false},{"source":"llama","target":"gpt_neox","label":"4 imports","cand":false},{"source":"llama","target":"gpt_neox","label":"4 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"llava","target":"got_ocr2","label":"6 imports","cand":false},{"source":"sam","target":"got_ocr2","label":"5 imports","cand":false},{"source":"sam","target":"got_ocr2","label":"5 imports","cand":false},{"source":"sam","target":"got_ocr2","label":"5 imports","cand":false},{"source":"sam","target":"got_ocr2","label":"5 imports","cand":false},{"source":"sam","target":"got_ocr2","label":"5 imports","cand":false},{"source":"glm4","target":"glm4v_moe","label":"1 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 
imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4_moe","target":"glm4v_moe","label":"7 imports","cand":false},{"source":"glm4v","target":"glm4v_moe","label":"2 imports","cand":false},{"source":"glm4v","target":"glm4v_moe","label":"2 imports","cand":false},{"source":"glm4","target":"glm4v","label":"3 imports","cand":false},{"source":"glm4","target":"glm4v","label":"3 imports","cand":false},{"source":"glm4","target":"glm4v","label":"3 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"qwen2_5_vl","target":"glm4v","label":"12 imports","cand":false},{"source":"cohere","target":"glm4_moe","label":"1 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 
imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"deepseek_v3","target":"glm4_moe","label":"7 imports","cand":false},{"source":"gpt_neox","target":"glm4_moe","label":"1 imports","cand":false},{"source":"glm","target":"glm4","label":"4 imports","cand":false},{"source":"glm","target":"glm4","label":"4 imports","cand":false},{"source":"glm","target":"glm4","label":"4 imports","cand":false},{"source":"glm","target":"glm4","label":"4 imports","cand":false},{"source":"phi3","target":"glm4","label":"1 imports","cand":false},{"source":"llama","target":"glm","label":"4 imports","cand":false},{"source":"llama","target":"glm","label":"4 imports","cand":false},{"source":"llama","target":"glm","label":"4 imports","cand":false},{"source":"llama","target":"glm","label":"4 imports","cand":false},{"source":"phi3","target":"glm","label":"1 imports","cand":false},{"source":"gemma2","target":"gemma3n","label":"5 imports","cand":false},{"source":"gemma2","target":"gemma3n","label":"5 imports","cand":false},{"source":"gemma2","target":"gemma3n","label":"5 imports","cand":false},{"source":"gemma2","target":"gemma3n","label":"5 imports","cand":false},{"source":"gemma2","target":"gemma3n","label":"5 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"gemma3","target":"gemma3n","label":"6 imports","cand":false},{"source":"paligemma","target":"gemma3n","label":"4 imports","cand":false},{"source":"paligemma","target":"gemma3n","label":"4 imports","cand":false},{"source":"paligemma","target":"gemma3n","label":"4 imports","cand":false},{"source":"paligemma","target":"gemma3n","label":"4 
imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"gemma2","target":"gemma3","label":"9 imports","cand":false},{"source":"paligemma","target":"gemma3","label":"5 imports","cand":false},{"source":"paligemma","target":"gemma3","label":"5 imports","cand":false},{"source":"paligemma","target":"gemma3","label":"5 imports","cand":false},{"source":"paligemma","target":"gemma3","label":"5 imports","cand":false},{"source":"paligemma","target":"gemma3","label":"5 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"gemma","target":"gemma2","label":"11 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 
imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"llama","target":"gemma","label":"9 imports","cand":false},{"source":"bart","target":"florence2","label":"2 imports","cand":false},{"source":"bart","target":"florence2","label":"2 imports","cand":false},{"source":"beit","target":"florence2","label":"1 imports","cand":false},{"source":"llama4","target":"florence2","label":"1 imports","cand":false},{"source":"llava","target":"florence2","label":"3 imports","cand":false},{"source":"llava","target":"florence2","label":"3 imports","cand":false},{"source":"llava","target":"florence2","label":"3 imports","cand":false},{"source":"mixtral","target":"flex_olmo","label":"2 imports","cand":false},{"source":"mixtral","target":"flex_olmo","label":"2 imports","cand":false},{"source":"olmo2","target":"flex_olmo","label":"3 imports","cand":false},{"source":"olmo2","target":"flex_olmo","label":"3 imports","cand":false},{"source":"olmo2","target":"flex_olmo","label":"3 imports","cand":false},{"source":"olmoe","target":"flex_olmo","label":"4 imports","cand":false},{"source":"olmoe","target":"flex_olmo","label":"4 imports","cand":false},{"source":"olmoe","target":"flex_olmo","label":"4 imports","cand":false},{"source":"olmoe","target":"flex_olmo","label":"4 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 
imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"mamba","target":"falcon_mamba","label":"10 imports","cand":false},{"source":"jamba","target":"falcon_h1","label":"1 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"llama","target":"falcon_h1","label":"7 imports","cand":false},{"source":"mamba2","target":"falcon_h1","label":"4 imports","cand":false},{"source":"mamba2","target":"falcon_h1","label":"4 imports","cand":false},{"source":"mamba2","target":"falcon_h1","label":"4 imports","cand":false},{"source":"mamba2","target":"falcon_h1","label":"4 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 imports","cand":false},{"source":"llama","target":"exaone4","label":"10 
imports","cand":false},{"source":"olmo2","target":"exaone4","label":"2 imports","cand":false},{"source":"olmo2","target":"exaone4","label":"2 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"esm","target":"evolla","label":"9 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"llama","target":"evolla","label":"6 imports","cand":false},{"source":"ernie4_5","target":"ernie4_5_moe","label":"3 imports","cand":false},{"source":"ernie4_5","target":"ernie4_5_moe","label":"3 imports","cand":false},{"source":"ernie4_5","target":"ernie4_5_moe","label":"3 imports","cand":false},{"source":"llama","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"llama","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"mixtral","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"mixtral","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"qwen3_moe","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"qwen3_moe","target":"ernie4_5_moe","label":"2 imports","cand":false},{"source":"glm","target":"ernie4_5","label":"1 
imports","cand":false},{"source":"llama","target":"ernie4_5","label":"4 imports","cand":false},{"source":"llama","target":"ernie4_5","label":"4 imports","cand":false},{"source":"llama","target":"ernie4_5","label":"4 imports","cand":false},{"source":"llama","target":"ernie4_5","label":"4 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"bert","target":"ernie","label":"17 imports","cand":false},{"source":"dinov2","target":"eomt","label":"4 imports","cand":false},{"source":"dinov2","target":"eomt","label":"4 imports","cand":false},{"source":"dinov2","target":"eomt","label":"4 imports","cand":false},{"source":"dinov2","target":"eomt","label":"4 imports","cand":false},{"source":"mask2former","target":"eomt","label":"2 imports","cand":false},{"source":"mask2former","target":"eomt","label":"2 imports","cand":false},{"source":"siglip","target":"eomt","label":"1 
imports","cand":false},{"source":"chameleon","target":"emu3","label":"2 imports","cand":false},{"source":"chameleon","target":"emu3","label":"2 imports","cand":false},{"source":"llama","target":"emu3","label":"5 imports","cand":false},{"source":"llama","target":"emu3","label":"5 imports","cand":false},{"source":"llama","target":"emu3","label":"5 imports","cand":false},{"source":"llama","target":"emu3","label":"5 imports","cand":false},{"source":"llama","target":"emu3","label":"5 imports","cand":false},{"source":"siglip","target":"emu3","label":"1 imports","cand":false},{"source":"deepseek_v3","target":"dots1","label":"5 imports","cand":false},{"source":"deepseek_v3","target":"dots1","label":"5 imports","cand":false},{"source":"deepseek_v3","target":"dots1","label":"5 imports","cand":false},{"source":"deepseek_v3","target":"dots1","label":"5 imports","cand":false},{"source":"deepseek_v3","target":"dots1","label":"5 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"qwen3","target":"dots1","label":"6 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"llama","target":"doge","label":"8 imports","cand":false},{"source":"mixtral","target":"doge","label":"2 
imports","cand":false},{"source":"mixtral","target":"doge","label":"2 imports","cand":false},{"source":"arcee","target":"dinov3_vit","label":"1 imports","cand":false},{"source":"dinov2","target":"dinov3_vit","label":"4 imports","cand":false},{"source":"dinov2","target":"dinov3_vit","label":"4 imports","cand":false},{"source":"dinov2","target":"dinov3_vit","label":"4 imports","cand":false},{"source":"dinov2","target":"dinov3_vit","label":"4 imports","cand":false},{"source":"llama","target":"dinov3_vit","label":"1 imports","cand":false},{"source":"pixtral","target":"dinov3_vit","label":"2 imports","cand":false},{"source":"pixtral","target":"dinov3_vit","label":"2 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"dinov2","target":"dinov2_with_registers","label":"6 imports","cand":false},{"source":"gemma","target":"diffllama","label":"1 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"llama","target":"diffllama","label":"8 imports","cand":false},{"source":"mistral","target":"diffllama","label":"1 imports","cand":false},{"source":"llama","target":"dia","label":"4 
imports","cand":false},{"source":"llama","target":"dia","label":"4 imports","cand":false},{"source":"llama","target":"dia","label":"4 imports","cand":false},{"source":"llama","target":"dia","label":"4 imports","cand":false},{"source":"phi3","target":"dia","label":"1 imports","cand":false},{"source":"deepseek_vl","target":"deepseek_vl_hybrid","label":"3 imports","cand":false},{"source":"deepseek_vl","target":"deepseek_vl_hybrid","label":"3 imports","cand":false},{"source":"deepseek_vl","target":"deepseek_vl_hybrid","label":"3 imports","cand":false},{"source":"idefics","target":"deepseek_vl_hybrid","label":"2 imports","cand":false},{"source":"idefics","target":"deepseek_vl_hybrid","label":"2 imports","cand":false},{"source":"sam","target":"deepseek_vl_hybrid","label":"2 imports","cand":false},{"source":"sam","target":"deepseek_vl_hybrid","label":"2 imports","cand":false},{"source":"idefics","target":"deepseek_vl","label":"2 imports","cand":false},{"source":"idefics","target":"deepseek_vl","label":"2 imports","cand":false},{"source":"janus","target":"deepseek_vl","label":"3 imports","cand":false},{"source":"janus","target":"deepseek_vl","label":"3 imports","cand":false},{"source":"janus","target":"deepseek_vl","label":"3 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v3","label":"9 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 
imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama","target":"deepseek_v2","label":"8 imports","cand":false},{"source":"llama4","target":"deepseek_v2","label":"1 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"roberta","target":"data2vec","label":"7 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 imports","cand":false},{"source":"wav2vec2","target":"data2vec","label":"11 
imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr","target":"d_fine","label":"12 imports","cand":false},{"source":"rt_detr_v2","target":"d_fine","label":"1 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"llama","target":"csm","label":"8 imports","cand":false},{"source":"colpali","target":"colqwen2","label":"2 imports","cand":false},{"source":"colpali","target":"colqwen2","label":"2 imports","cand":false},{"source":"aya_vision","target":"cohere2_vision","label":"4 imports","cand":false},{"source":"aya_vision","target":"cohere2_vision","label":"4 imports","cand":false},{"source":"aya_vision","target":"cohere2_vision","label":"4 imports","cand":false},{"source":"aya_vision","target":"cohere2_vision","label":"4 
imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"cohere","target":"cohere2","label":"8 imports","cand":false},{"source":"gemma2","target":"cohere2","label":"1 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"llama","target":"cohere","label":"6 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"roberta","target":"camembert","label":"8 imports","cand":false},{"source":"cohere2","target":"blt","label":"2 imports","cand":false},{"source":"cohere2","target":"blt","label":"2 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 
imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"mllama","target":"blt","label":"8 imports","cand":false},{"source":"gemma","target":"bitnet","label":"1 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"llama","target":"bitnet","label":"7 imports","cand":false},{"source":"bart","target":"biogpt","label":"3 imports","cand":false},{"source":"bart","target":"biogpt","label":"3 imports","cand":false},{"source":"bart","target":"biogpt","label":"3 imports","cand":false},{"source":"opt","target":"biogpt","label":"1 imports","cand":false},{"source":"jamba","target":"bamba","label":"2 imports","cand":false},{"source":"jamba","target":"bamba","label":"2 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"llama","target":"bamba","label":"6 imports","cand":false},{"source":"mamba2","target":"bamba","label":"4 imports","cand":false},{"source":"mamba2","target":"bamba","label":"4 
imports","cand":false},{"source":"mamba2","target":"bamba","label":"4 imports","cand":false},{"source":"mamba2","target":"bamba","label":"4 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llava","target":"aya_vision","label":"6 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llama","target":"aria","label":"7 imports","cand":false},{"source":"llava","target":"aria","label":"4 imports","cand":false},{"source":"llava","target":"aria","label":"4 imports","cand":false},{"source":"llava","target":"aria","label":"4 imports","cand":false},{"source":"llava","target":"aria","label":"4 imports","cand":false},{"source":"llama","target":"arcee","label":"4 imports","cand":false},{"source":"llama","target":"arcee","label":"4 imports","cand":false},{"source":"llama","target":"arcee","label":"4 imports","cand":false},{"source":"llama","target":"arcee","label":"4 imports","cand":false},{"source":"nemotron","target":"arcee","label":"1 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 
imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"llama","target":"apertus","label":"10 imports","cand":false},{"source":"nemotron","target":"apertus","label":"1 imports","cand":false},{"source":"clip","target":"aimv2","label":"3 imports","cand":false},{"source":"clip","target":"aimv2","label":"3 imports","cand":false},{"source":"clip","target":"aimv2","label":"3 imports","cand":false},{"source":"llama","target":"aimv2","label":"2 imports","cand":false},{"source":"llama","target":"aimv2","label":"2 imports","cand":false},{"source":"siglip","target":"aimv2","label":"3 imports","cand":false},{"source":"siglip","target":"aimv2","label":"3 imports","cand":false},{"source":"siglip","target":"aimv2","label":"3 
imports","cand":false},{"source":"altclip","target":"chinese_clip","label":"74.9%","cand":true},{"source":"audio_spectrogram_transformer","target":"vit_msn","label":"66.8%","cand":true},{"source":"audio_spectrogram_transformer","target":"vivit","label":"68.1%","cand":true},{"source":"autoformer","target":"time_series_transformer","label":"72.0%","cand":true},{"source":"bart","target":"blenderbot","label":"77.5%","cand":true},{"source":"bart","target":"blenderbot_small","label":"80.0%","cand":true},{"source":"bart","target":"m2m_100","label":"71.6%","cand":true},{"source":"bart","target":"marian","label":"72.6%","cand":true},{"source":"bart","target":"mbart","label":"87.8%","cand":true},{"source":"bart","target":"mvp","label":"71.0%","cand":true},{"source":"bart","target":"pegasus","label":"75.3%","cand":true},{"source":"bert","target":"bert_generation","label":"72.9%","cand":true},{"source":"bert","target":"electra","label":"73.3%","cand":true},{"source":"bert","target":"megatron_bert","label":"67.1%","cand":true},{"source":"bert","target":"roberta_prelayernorm","label":"78.4%","cand":true},{"source":"bert","target":"roc_bert","label":"77.0%","cand":true},{"source":"bert","target":"xmod","label":"71.2%","cand":true},{"source":"bert_generation","target":"electra","label":"72.9%","cand":true},{"source":"bert_generation","target":"roberta_prelayernorm","label":"75.4%","cand":true},{"source":"bert_generation","target":"roc_bert","label":"68.2%","cand":true},{"source":"bert_generation","target":"xmod","label":"67.9%","cand":true},{"source":"blenderbot","target":"blenderbot_small","label":"89.8%","cand":true},{"source":"blenderbot","target":"m2m_100","label":"77.8%","cand":true},{"source":"blenderbot","target":"marian","label":"78.5%","cand":true},{"source":"blenderbot","target":"mbart","label":"77.6%","cand":true},{"source":"blenderbot","target":"pegasus","label":"84.1%","cand":true},{"source":"blenderbot","target":"pegasus_x","label":"69.9%","cand":true},{"source":"blen
derbot_small","target":"m2m_100","label":"80.8%","cand":true},{"source":"blenderbot_small","target":"marian","label":"81.8%","cand":true},{"source":"blenderbot_small","target":"mbart","label":"80.8%","cand":true},{"source":"blenderbot_small","target":"pegasus","label":"87.5%","cand":true},{"source":"blenderbot_small","target":"pegasus_x","label":"71.7%","cand":true},{"source":"blenderbot_small","target":"speech_to_text","label":"66.2%","cand":true},{"source":"blip_2","target":"instructblip","label":"79.7%","cand":true},{"source":"clip","target":"clipseg","label":"69.2%","cand":true},{"source":"clip","target":"x_clip","label":"65.3%","cand":true},{"source":"convnext","target":"convnextv2","label":"83.0%","cand":true},{"source":"deberta","target":"deberta_v2","label":"71.7%","cand":true},{"source":"decision_transformer","target":"imagegpt","label":"66.9%","cand":true},{"source":"deit","target":"vit","label":"79.0%","cand":true},{"source":"deit","target":"vit_msn","label":"73.9%","cand":true},{"source":"deit","target":"vivit","label":"68.6%","cand":true},{"source":"detr","target":"table_transformer","label":"78.0%","cand":true},{"source":"dinov2","target":"vit_msn","label":"68.6%","cand":true},{"source":"donut","target":"swin","label":"79.4%","cand":true},{"source":"donut","target":"swinv2","label":"72.9%","cand":true},{"source":"electra","target":"roberta_prelayernorm","label":"76.7%","cand":true},{"source":"electra","target":"roc_bert","label":"68.7%","cand":true},{"source":"electra","target":"xmod","label":"69.7%","cand":true},{"source":"encoder_decoder","target":"speech_encoder_decoder","label":"69.2%","cand":true},{"source":"encoder_decoder","target":"vision_encoder_decoder","label":"68.0%","cand":true},{"source":"flaubert","target":"xlm","label":"89.5%","cand":true},{"source":"gpt_neox_japanese","target":"persimmon","label":"65.5%","cand":true},{"source":"granitemoe","target":"jetmoe","label":"70.5%","cand":true},{"source":"granitemoe","target":"olmoe","label":"6
5.1%","cand":true},{"source":"idefics2","target":"idefics3","label":"76.9%","cand":true},{"source":"jetmoe","target":"olmoe","label":"71.2%","cand":true},{"source":"jetmoe","target":"qwen2_moe","label":"66.3%","cand":true},{"source":"layoutlm","target":"markuplm","label":"69.5%","cand":true},{"source":"llava","target":"video_llava","label":"65.2%","cand":true},{"source":"m2m_100","target":"marian","label":"73.5%","cand":true},{"source":"m2m_100","target":"mbart","label":"72.9%","cand":true},{"source":"m2m_100","target":"pegasus","label":"79.7%","cand":true},{"source":"m2m_100","target":"pegasus_x","label":"71.6%","cand":true},{"source":"m2m_100","target":"speech_to_text","label":"74.9%","cand":true},{"source":"marian","target":"mbart","label":"72.9%","cand":true},{"source":"marian","target":"pegasus","label":"83.5%","cand":true},{"source":"marian","target":"pegasus_x","label":"65.0%","cand":true},{"source":"mbart","target":"mvp","label":"69.3%","cand":true},{"source":"mbart","target":"pegasus","label":"76.7%","cand":true},{"source":"megatron_bert","target":"rembert","label":"77.0%","cand":true},{"source":"megatron_bert","target":"roformer","label":"67.6%","cand":true},{"source":"mobilenet_v1","target":"mobilenet_v2","label":"65.6%","cand":true},{"source":"mt5","target":"pop2piano","label":"67.8%","cand":true},{"source":"mt5","target":"t5","label":"90.4%","cand":true},{"source":"mt5","target":"umt5","label":"81.6%","cand":true},{"source":"musicgen","target":"musicgen_melody","label":"82.6%","cand":true},{"source":"nemotron","target":"olmoe","label":"70.0%","cand":true},{"source":"nemotron","target":"qwen2_moe","label":"65.7%","cand":true},{"source":"nemotron","target":"stablelm","label":"73.9%","cand":true},{"source":"olmoe","target":"phimoe","label":"65.3%","cand":true},{"source":"olmoe","target":"qwen2_moe","label":"80.2%","cand":true},{"source":"olmoe","target":"stablelm","label":"68.0%","cand":true},{"source":"pegasus","target":"pegasus_x","label":"72.1%","cand":
true},{"source":"pegasus","target":"speech_to_text","label":"65.5%","cand":true},{"source":"persimmon","target":"stablelm","label":"74.2%","cand":true},{"source":"phimoe","target":"qwen2_moe","label":"69.6%","cand":true},{"source":"pix2struct","target":"pop2piano","label":"68.1%","cand":true},{"source":"pix2struct","target":"t5","label":"65.8%","cand":true},{"source":"pix2struct","target":"umt5","label":"65.4%","cand":true},{"source":"pop2piano","target":"t5","label":"70.4%","cand":true},{"source":"pop2piano","target":"umt5","label":"68.3%","cand":true},{"source":"pvt","target":"pvt_v2","label":"65.8%","cand":true},{"source":"regnet","target":"resnet","label":"72.5%","cand":true},{"source":"rembert","target":"roformer","label":"72.7%","cand":true},{"source":"roberta_prelayernorm","target":"roc_bert","label":"73.7%","cand":true},{"source":"roberta_prelayernorm","target":"xmod","label":"80.5%","cand":true},{"source":"roc_bert","target":"xmod","label":"67.6%","cand":true},{"source":"seamless_m4t","target":"seamless_m4t_v2","label":"72.6%","cand":true},{"source":"speech_encoder_decoder","target":"vision_encoder_decoder","label":"83.7%","cand":true},{"source":"swin","target":"swinv2","label":"80.9%","cand":true},{"source":"switch_transformers","target":"umt5","label":"65.1%","cand":true},{"source":"t5","target":"umt5","label":"80.1%","cand":true},{"source":"trocr","target":"xglm","label":"77.9%","cand":true},{"source":"vit","target":"vit_msn","label":"77.5%","cand":true},{"source":"vit","target":"vivit","label":"67.8%","cand":true},{"source":"vit_msn","target":"vivit","label":"74.0%","cand":true}]}; |
| |
| |
// Banner canvas set-up: the graph is laid out in a fixed W x H coordinate
// space and the SVG viewBox maps that space onto whatever size the element has.
const root = document.documentElement; // not referenced in the visible part of this script
const W = 3000;
const H = 1200;

const svg = d3.select('#banner-svg');
svg.attr('viewBox', `0 0 ${W} ${H}`);
const defs = svg.append('defs');

// Radial gradient used as a mask fill: fully opaque white out to 30% of the
// radius, fading to fully transparent from 85% outwards.
const radialGradient = defs.append('radialGradient');
radialGradient
  .attr('id', 'fadeEllipse')
  .attr('cx', '50%')
  .attr('cy', '50%')
  .attr('r', '75%');

// [offset, stop-opacity] pairs, appended in order.
for (const [offset, opacity] of [
  ['0%', 1],
  ['30%', 1],
  ['50%', 0.8],
  ['70%', 0.3],
  ['85%', 0],
  ['100%', 0],
]) {
  radialGradient.append('stop')
    .attr('offset', offset)
    .attr('stop-color', 'white')
    .attr('stop-opacity', opacity);
}

// Full-size rectangle painted with the gradient; used as the mask for the
// group that holds every link and node.
defs.append('mask')
  .attr('id', 'edgeMask')
  .append('rect')
  .attr('width', '100%')
  .attr('height', '100%')
  .attr('fill', 'url(#fadeEllipse)');

const g = svg.append('g');
g.attr('mask', 'url(#edgeMask)');
| |
| |
// One <line> per graph link; links flagged `cand` get the extra "cand" class.
const link = g.selectAll('line')
  .data(graph.links)
  .join('line')
  .attr('class', (edge) => {
    if (edge.cand) return 'link cand';
    return 'link';
  });

// One <g> per graph node, classed by the node's `cls`; the "llama" node gets
// an additional "llama" class. Every node group is draggable.
const node = g.selectAll('g.node')
  .data(graph.nodes)
  .join('g')
  .attr('class', (n) => {
    const base = `node ${n.cls}`;
    return n.id === 'llama' ? `${base} llama` : base;
  })
  .call(
    d3.drag()
      .on('start', dragStart)
      .on('drag', dragged)
      .on('end', dragEnd)
  );
| |
| |
// Circle radius for a node: the "llama" node is always 40; any other node
// uses 10 (cls "base") or 8 (everything else), scaled by `sz` when it is set
// to a truthy value.
const rBase = (n) => {
  if (n.id === 'llama') return 40;
  const unit = n.cls === 'base' ? 10 : 8;
  const scale = n.sz || 1;
  return unit * scale;
};
node.append('circle').attr('r', rBase);
| |
| |
/**
 * Shorten a node id for use as a label.
 *
 * - Ids of 10 characters or fewer are returned unchanged.
 * - Ids containing "-" or "_" are tried as "<first part, max 7 chars>…<last
 *   part, last 5 chars>"; that form is used when it is at most 12 characters.
 * - Anything else is cut to its first 9 characters followed by "…".
 *
 * @param {string} id node identifier
 * @returns {string} the label text
 */
function shortId(id) {
  const truncated = () => id.slice(0, 9) + "…";

  if (id.length <= 10) return id;

  const pieces = id.split(/[-_]/);
  if (pieces.length < 2) return truncated();

  const head = pieces[0].slice(0, 7);
  const tail = pieces[pieces.length - 1].slice(-5);
  const compact = `${head}…${tail}`;
  return compact.length > 12 ? truncated() : compact;
}
| |
// Text label above each node, abbreviated with shortId(); the "llama" node
// gets its own label class and a slightly smaller vertical offset.
node.append('text')
  .attr('class', (n) => {
    if (n.id === 'llama') return 'node-label llama-label';
    return 'node-label';
  })
  .attr('dy', (n) => {
    if (n.id === 'llama') return '-0.8em';
    return '-1.1em';
  })
  .text((n) => shortId(n.id));

// Pin the "llama" node to the centre of the canvas via its fixed-position
// fields.
const llamaNode = graph.nodes.find((n) => n.id === 'llama');
if (llamaNode) {
  Object.assign(llamaNode, { fx: W / 2, fy: H / 2 });
}

// Move the "llama" group to the end of its parent so it is drawn on top.
node.filter((n) => n.id === 'llama').raise();
| |
| |
// Force-directed layout for the banner graph.
//
// Forces:
//   link    - springs along graph.links; links touching "llama" are longer
//             (200 vs 150) and weaker (0.2 vs 0.4).
//   charge  - node repulsion, stronger for "llama".
//   center  - keeps the node cloud centred on the canvas.
//   x / y   - pull each node toward a slot on a 12-column x 3-row grid
//             (columns span 7%..93% of W; rows sit at H/6, H/2 and 5H/6),
//             chosen from the node's index.
//   collide - keeps circles apart: node radius plus 20 of padding (100 for
//             "llama").
//
// The collide force must be the force object itself, with a per-node radius
// accessor. Registering a function that *returns* d3.forceCollide(...) makes
// D3 treat that wrapper as the force: it is invoked with `alpha` on each tick
// and its result is discarded, so no collision handling ever takes place.
const sim = d3.forceSimulation(graph.nodes)
  .force(
    'link',
    d3.forceLink(graph.links)
      .id(d => d.id)
      .distance(d => (d.source.id === 'llama' || d.target.id === 'llama' ? 200 : 150))
      .strength(d => (d.source.id === 'llama' || d.target.id === 'llama' ? 0.2 : 0.4))
  )
  .force('charge', d3.forceManyBody().strength(d => (d.id === 'llama' ? -2000 : -1200)))
  .force('center', d3.forceCenter(W / 2, H / 2))
  .force('x', d3.forceX((_, i) => (i % 12) / 11 * (W * 0.86) + W * 0.07).strength(0.12))
  .force('y', d3.forceY((_, i) => (Math.floor(i / 12) % 3) * (H / 3) + H / 6).strength(0.25))
  .force('collide', d3.forceCollide().radius(d => rBase(d) + (d.id === 'llama' ? 100 : 20)));

// Copy the simulated positions onto the SVG elements on every tick.
sim.on('tick', () => {
  link
    .attr('x1', d => d.source.x).attr('y1', d => d.source.y)
    .attr('x2', d => d.target.x).attr('y2', d => d.target.y);
  node.attr('transform', d => `translate(${d.x},${d.y})`);
});

// A non-zero alpha target means alpha never decays to the stop threshold, so
// the graph keeps drifting gently instead of freezing.
sim.alphaTarget(0.04);
| |
// Drag behaviour for node groups (wired up through d3.drag()).

// Drag begins: when no other drag is active, raise the simulation's alpha
// target and restart it; then fix the node at its current position.
function dragStart(e, d) {
  if (!e.active) {
    sim.alphaTarget(.3).restart();
  }
  d.fx = d.x;
  d.fy = d.y;
}

// While dragging: the fixed position follows the pointer.
function dragged(e, d) {
  d.fx = e.x;
  d.fy = e.y;
}

// Drag ends: when no other drag is active, lower the alpha target again;
// then clear the node's fixed position.
function dragEnd(e, d) {
  if (!e.active) {
    sim.alphaTarget(0.02);
  }
  d.fy = null;
  d.fx = null;
}
| |
| |
/**
 * Apply the categorical palette to the page and reveal the banner.
 *
 * When `window.ColorPalettes` exists and returns at least three colours for
 * ('categorical', 3), the first three are written to the
 * --palette-categorical-1..3 custom properties on the root element.
 * Independently of that, the `.banner-container` element (if present) is
 * given the "loaded" class.
 */
function initializeColorPalettes() {
  const palettes = window.ColorPalettes;
  if (palettes) {
    const colors = palettes.getColors('categorical', 3);
    if (colors.length >= 3) {
      const rootStyle = document.documentElement.style;
      [
        '--palette-categorical-1',
        '--palette-categorical-2',
        '--palette-categorical-3',
      ].forEach((property, i) => {
        rootStyle.setProperty(property, colors[i]);
      });
    }
  }

  const bannerContainer = document.querySelector('.banner-container');
  if (!bannerContainer) return;
  bannerContainer.classList.add('loaded');
}
| |
| |
// Re-apply palette colours whenever the palette system announces a change.
document.addEventListener('palettes:updated', initializeColorPalettes);

// Initial application shortly after the DOM is ready (100 ms delay).
document.addEventListener('DOMContentLoaded', () => {
  setTimeout(initializeColorPalettes, 100);
});

// Reload to rebuild the banner after a real viewport resize.
//
// Reloading on every raw `resize` event is harmful: the event fires many
// times while a window edge is dragged, and mobile browsers fire it for
// height-only changes when the URL bar shows or hides during scrolling,
// which would reload the article while it is being read. So the reload is
// debounced and only happens when the viewport *width* differs from the
// width the page was loaded at.
{
  const RELOAD_DEBOUNCE_MS = 250;
  const widthAtLoad = window.innerWidth;
  let reloadTimer = null;

  window.addEventListener('resize', () => {
    // Always cancel a pending reload first so that resizing back to the
    // original width within the debounce window results in no reload.
    clearTimeout(reloadTimer);
    if (window.innerWidth === widthAtLoad) return;
    reloadTimer = setTimeout(() => location.reload(), RELOAD_DEBOUNCE_MS);
  });
}
| </script> |
| </div></div></figure> <p class="hero-desc" data-astro-cid-bbe6dxrz>A peek into software engineering for the transformers library</p> </div> </section> <header class="meta" aria-label="Article meta information" data-astro-cid-bbe6dxrz> <div class="meta-container" data-astro-cid-bbe6dxrz> <div class="meta-container-cell" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Authors</h3> <div class="authors" data-astro-cid-bbe6dxrz> <span data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/Molbap" data-astro-cid-bbe6dxrz>Pablo Montalvo</a> </span> <span data-astro-cid-bbe6dxrz>, </span> <span data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/Lysandre" data-astro-cid-bbe6dxrz>Lysandre Debut</a> </span> <span data-astro-cid-bbe6dxrz>, </span> <span data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/pcuenq" data-astro-cid-bbe6dxrz>Pedro Cuenca</a> </span> <span data-astro-cid-bbe6dxrz>, </span> <span data-astro-cid-bbe6dxrz> <a href="https://huggingface.co/yonigozlan" data-astro-cid-bbe6dxrz>Yoni Gozlan</a> </span> </div> </div> <div class="meta-container-cell meta-container-cell--affiliations" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Affiliation</h3> <p data-astro-cid-bbe6dxrz> <a href="https://huggingface.co" target="_blank" rel="noopener noreferrer" data-astro-cid-bbe6dxrz> Hugging Face </a> </p> </div> <div class="meta-container-cell meta-container-cell--published" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>Published</h3> <p data-astro-cid-bbe6dxrz>October 2, 2025</p> </div> |
| |
| |
| |
| |
| <div class="meta-container-cell meta-container-cell--pdf" data-astro-cid-bbe6dxrz> <h3 data-astro-cid-bbe6dxrz>PDF</h3> <p data-astro-cid-bbe6dxrz> <a class="button" href="/maintain-the-unmaintainable-1m-python-loc-400-models.pdf" download="maintain-the-unmaintainable-1m-python-loc-400-models.pdf" aria-label="Download PDF maintain-the-unmaintainable-1m-python-loc-400-models.pdf" data-astro-cid-bbe6dxrz> |
| Download PDF |
| </a> </p> </div> </div> </header> <section class="content-grid"> <nav class="table-of-contents" aria-label="Table of Contents" data-auto-collapse="1"> <div class="title">Table of Contents</div> <div id="article-toc-placeholder"></div> </nav> <details class="table-of-contents-mobile"> <summary>Table of Contents</summary> <div id="article-toc-mobile-placeholder"></div> </details> <script> |
| |
/**
 * Build the article table of contents from the h2/h3/h4 headings found in
 * `section.content-grid main` and mount it in the sidebar placeholder
 * (#article-toc-placeholder) and the mobile <details> placeholder
 * (#article-toc-mobile-placeholder).
 *
 * Besides rendering the nested list, this:
 *   - gives every heading a unique id (slug of its text when it has none);
 *   - highlights the link of the last heading scrolled past the 60px mark;
 *   - when the sidebar carries data-auto-collapse="1" and the viewport is
 *     wider than the --bp-content-collapse breakpoint (default 1100px),
 *     keeps only the active h2's sub-list expanded, animating heights;
 *   - closes the mobile <details> after one of its links is clicked.
 *
 * Does nothing (beyond clearing the placeholders) when the article root or
 * its headings are missing.
 */
const buildTOC = () => {
  const holder = document.getElementById('article-toc-placeholder');
  const holderMobile = document.getElementById('article-toc-mobile-placeholder');

  // Start from empty placeholders so a rebuild never duplicates entries.
  if (holder) holder.innerHTML = '';
  if (holderMobile) holderMobile.innerHTML = '';

  const articleRoot = document.querySelector('section.content-grid main');
  if (!articleRoot) return;
  const headingsArr = Array.from(articleRoot.querySelectorAll('h2, h3, h4'));
  if (!headingsArr.length) return;

  // --- Unique heading ids -------------------------------------------------
  const usedIds = new Set();
  const slugify = (s) => String(s || '')
    .toLowerCase()
    .trim()
    .replace(/\s+/g, '_')
    .replace(/[^a-z0-9_\-]/g, '');
  headingsArr.forEach((h) => {
    let id = (h.id || '').trim();
    if (!id) {
      const base = slugify(h.textContent || '');
      id = base || 'section';
    }
    // Append -2, -3, ... until the id is unused.
    let candidate = id;
    let n = 2;
    while (usedIds.has(candidate)) {
      candidate = `${id}-${n++}`;
    }
    if (h.id !== candidate) h.id = candidate;
    usedIds.add(candidate);
  });

  // --- Nested list ----------------------------------------------------------
  const nav = document.createElement('nav');
  // ulStack[k] is the list that receives entries of level k + 2; its length
  // is always `prev - 1`.
  const ulStack = [document.createElement('ul')];
  nav.appendChild(ulStack[0]);

  const levelOf = (tag) => tag === 'H2' ? 2 : tag === 'H3' ? 3 : 4;
  let prev = 2;
  let h2Count = -1;
  const h2List = headingsArr.filter(h => h.tagName === 'H2');
  headingsArr.forEach((h) => {
    const lvl = levelOf(h.tagName);

    while (lvl > prev) {
      const parentUl = ulStack[ulStack.length - 1];
      const hostLi = parentUl.lastElementChild;
      if (hostLi) {
        const ul = document.createElement('ul');
        hostLi.appendChild(ul);
        ulStack.push(ul);
      } else {
        // No entry to nest under (the article starts below h2, or a level
        // was skipped). Reuse the nearest attached list: a fresh <ul> here
        // would never be attached and its entries would vanish from the TOC.
        ulStack.push(parentUl);
      }
      prev++;
    }
    while (lvl < prev) { ulStack.pop(); prev--; }

    const li = document.createElement('li');
    const a = document.createElement('a');
    a.href = '#' + h.id;
    a.textContent = h.textContent;
    a.target = '_self';
    li.appendChild(a);
    if (lvl === 2) {
      // Index into h2List, used to match list items with the active section.
      h2Count += 1;
      li.setAttribute('data-h2-idx', String(h2Count));
    }
    ulStack[ulStack.length - 1].appendChild(li);
  });

  if (holder) holder.appendChild(nav);
  const navClone = nav.cloneNode(true);
  if (holderMobile) holderMobile.appendChild(navClone);

  // Every TOC link in both copies, for active-link highlighting.
  const links = [
    ...(holder ? holder.querySelectorAll('a') : []),
    ...(holderMobile ? holderMobile.querySelectorAll('a') : [])
  ];

  // --- Auto-collapse configuration -----------------------------------------
  const getCollapsePx = () => {
    const root = document.documentElement;
    const raw = getComputedStyle(root).getPropertyValue('--bp-content-collapse').trim();
    return raw || '1100px';
  };
  const mq = window.matchMedia(`(max-width: ${getCollapsePx()})`);
  const attrEnabled = (document.querySelector('.table-of-contents')?.getAttribute('data-auto-collapse') === '1');
  let autoCollapse = attrEnabled && !mq.matches;

  // Inject (once per document) the rules that make sub-list heights animatable.
  const ensureStyles = () => {
    if (document.getElementById('toc-collapse-style')) return;
    const style = document.createElement('style');
    style.id = 'toc-collapse-style';
    style.textContent = `
.table-of-contents nav.table-of-contents-collapsible > ul > li > ul,
details.table-of-contents-mobile nav.table-of-contents-collapsible > ul > li > ul { overflow: hidden; transition: height 200ms ease; }
.table-of-contents nav.table-of-contents-collapsible > ul > li.collapsed > ul,
details.table-of-contents-mobile nav.table-of-contents-collapsible > ul > li.collapsed > ul { display: block; }
`;
    document.head.appendChild(style);
  };
  ensureStyles();

  const getTopLevelItems = () => {
    const sideNav = holder ? holder.querySelector('nav') : null;
    const mobileNav = holderMobile ? holderMobile.querySelector('nav') : null;
    const q = (navEl) => navEl ? Array.from(navEl.querySelectorAll(':scope > ul > li[data-h2-idx]')) : [];
    return { sideNav, mobileNav, sideTop: q(sideNav), mobileTop: q(mobileNav) };
  };

  const setNavCollapsible = () => {
    const sideNav = holder ? holder.querySelector('nav') : null;
    const mobileNav = holderMobile ? holderMobile.querySelector('nav') : null;
    if (sideNav) sideNav.classList.add('table-of-contents-collapsible');
    if (mobileNav) mobileNav.classList.add('table-of-contents-collapsible');
  };

  // Natural (auto) height of an element, measured without leaving the
  // inline height changed.
  const measure = (el) => {
    if (!el) return 0;
    const prevHeight = el.style.height;
    el.style.height = 'auto';
    const h = el.scrollHeight;
    el.style.height = prevHeight || '';
    return h;
  };

  // transitionend handler still waiting on each element. An interrupted
  // transition does not fire `transitionend`, so without this bookkeeping a
  // handler from an abandoned expand would survive and, when the following
  // collapse finishes, set height back to 'auto' and pop the section open.
  const pendingEnd = new WeakMap();

  // Animate `el` from its current computed height to `target` pixels.
  const animateTo = (el, target) => {
    if (!el) return;
    const stale = pendingEnd.get(el);
    if (stale) {
      el.removeEventListener('transitionend', stale);
      pendingEnd.delete(el);
    }
    const current = parseFloat(getComputedStyle(el).height) || 0;
    if (Math.abs(current - target) < 1) {
      // Already there: settle on the resting value without a transition.
      el.style.height = target ? 'auto' : '0px';
      return;
    }
    el.style.height = current + 'px';
    // Force a reflow so the browser registers the start height before the
    // end height is set; otherwise no transition runs.
    void el.offsetHeight;
    el.style.height = target + 'px';
    const onEnd = (e) => {
      if (e.target !== el || e.propertyName !== 'height') return;
      el.removeEventListener('transitionend', onEnd);
      pendingEnd.delete(el);
      // Expanded lists go back to auto so they can grow or shrink freely.
      if (target > 0) el.style.height = 'auto';
    };
    pendingEnd.set(el, onEnd);
    el.addEventListener('transitionend', onEnd);
  };

  let prevActiveIdx = -1;
  // Expand the sub-list of h2 number `activeIdx` and collapse the others.
  const setCollapsedState = (activeIdx) => {
    if (activeIdx == null || activeIdx < 0) activeIdx = 0;
    // Remember the section even while auto-collapse is off, so that
    // re-enabling it on a breakpoint change restores the right one.
    prevActiveIdx = activeIdx;
    if (!autoCollapse) return;
    const { sideTop, mobileTop } = getTopLevelItems();
    const update = (items) => items.forEach((li) => {
      const idx = Number(li.getAttribute('data-h2-idx') || '-1');
      const sub = li.querySelector(':scope > ul');
      if (!sub) return;
      if (idx === activeIdx) {
        li.classList.remove('collapsed');
        animateTo(sub, measure(sub));
      } else {
        li.classList.add('collapsed');
        animateTo(sub, 0);
      }
    });
    update(sideTop);
    update(mobileTop);
    setNavCollapsible();
  };

  // Undo any collapsing: every sub-list back to its natural height.
  const expandAll = () => {
    const { sideTop, mobileTop } = getTopLevelItems();
    const expand = (items) => items.forEach((li) => {
      li.classList.remove('collapsed');
      const sub = li.querySelector(':scope > ul');
      if (sub) sub.style.height = 'auto';
    });
    expand(sideTop);
    expand(mobileTop);
  };

  const onMqChange = () => {
    autoCollapse = attrEnabled && !mq.matches;
    if (!autoCollapse) {
      expandAll();
    } else {
      setCollapsedState(prevActiveIdx);
    }
  };
  // addListener is the legacy MediaQueryList API, kept as a fallback.
  if (mq.addEventListener) mq.addEventListener('change', onMqChange);
  else if (mq.addListener) mq.addListener(onMqChange);

  // --- Scroll tracking ------------------------------------------------------
  const onScroll = () => {
    let activeIdx = -1;
    // Walk backwards to find the last heading at or above the 60px line.
    for (let i = headingsArr.length - 1; i >= 0; i--) {
      const top = headingsArr[i].getBoundingClientRect().top;
      if (top - 60 > 0) continue;

      links.forEach(l => l.classList.remove('active'));
      const id = '#' + headingsArr[i].id;
      links
        .filter(l => l.getAttribute('href') === id)
        .forEach(a => a.classList.add('active'));

      // The owning section is the nearest h2 at or before this heading.
      for (let j = i; j >= 0; j--) {
        if (headingsArr[j].tagName === 'H2') {
          activeIdx = h2List.indexOf(headingsArr[j]);
          break;
        }
      }
      break;
    }
    // Compare in the same normalised form that setCollapsedState stores;
    // comparing the raw -1 would re-run the collapse logic (layout reads
    // included) on every scroll event above the first heading.
    const normalized = activeIdx < 0 ? 0 : activeIdx;
    if (normalized !== prevActiveIdx) setCollapsedState(normalized);
  };

  if (autoCollapse) setCollapsedState(0);

  window.addEventListener('scroll', onScroll);

  // Sync with the initial scroll position (e.g. a page opened at an anchor).
  onScroll();

  // Close the mobile TOC after the reader picks an entry.
  if (holderMobile) {
    const details = holderMobile.closest('details');
    holderMobile.addEventListener('click', (ev) => {
      const target = ev.target;
      const anchor = target && 'closest' in target ? target.closest('a') : null;
      if (anchor instanceof HTMLAnchorElement && details && details.open) {
        details.open = false;
      }
    });
  }
};
| |
// Build the TOC right away when the document has already been parsed;
// otherwise wait for DOMContentLoaded and run exactly once.
if (document.readyState !== 'loading') {
  buildTOC();
} else {
  document.addEventListener('DOMContentLoaded', buildTOC, { once: true });
}
| </script> <main> <h2 id="preface"><a href="#preface">Preface</a></h2> |
| <p>One million lines of <code>python</code> code. Through them, the <code>transformers</code> library supports more than 400 model architectures, from state-of-the-art LLMs and VLMs to specialized models for audio, video, and tables.</p> |
| <p>Built on <code>PyTorch</code>, <code>transformers</code> is a foundational tool for modern LLM usage, research, and education, and for tens of thousands of other open-source projects. Each AI model is added by the community, harmonized into a consistent interface, and tested daily in CI to ensure reproducibility.</p> |
| <p>This scale presents a monumental engineering challenge.</p> |
| <p>How do you keep such a ship afloat, made of so many moving, unrelated parts, contributed to by a buzzing hivemind? Especially as the pace of ML research accelerates? We receive constant feedback on everything from function signatures with hundreds of arguments to duplicated code and optimization concerns, and we listen to all of it, or try to. The library’s usage keeps on growing, and we are a small team of maintainers and contributors, backed by hundreds of open-source community members. |
| We continue to support all new models and expect to do so for the foreseeable future.</p> |
| <p>This post dissects the design philosophy that makes this possible. It’s the result of an evolution from our older principles, detailed on our previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, as well as its accompanying <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post from 2022</a>. More recently (and we strongly recommend the read) we published a blog post about <a href="https://huggingface.co/blog/faster-transformers">recent upgrades to transformers</a>, focusing on what makes the library faster today. All of these developments are only made possible thanks to these principles.</p> |
| <p>We formalize and articulate the “tenets” that have been guiding our development, demonstrate how they are implemented in code, and show the measurable impact they have on the library’s sustainability and growth.</p> |
| <p>For any OSS maintainer, power user, or contributor, this is the map to understanding, using, and building upon <code>transformers</code>. But it is more than that: any project of comparable size will require you to make deep choices, not only about design and abstractions, but about the very mindset of the software you are building. These tenets may or may not be applicable to your project, but they provide a glimpse into how we work that could be helpful or inspirational.</p> |
| <p>Conventions used throughout this post:</p> |
| <p><span class="glossary-term " style="" data-tooltip-id="tenet-6ohqvl3xos8" onmouseenter="window.showTenetTooltip(event, 'tenet-6ohqvl3xos8')" onmousemove="window.updateTenetTooltip(event, 'tenet-6ohqvl3xos8')" onmouseleave="window.hideTenetTooltip('tenet-6ohqvl3xos8')">Tenets exemplified</span><span id="tenet-6ohqvl3xos8" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> will have their summary available on hover.</p> |
| <p><a href="https://huggingface.co/blog/welcome-openai-gpt-oss">External links</a> to articles will help you solidify your knowledge.</p> |
| <p><a href="#generated-modeling">Several interactive visualisations</a> are available as you go - scroll, zoom, drag away to explore them.</p> |
| <div class="note note--neutral" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Breadcrumb boxes summarize what you just learned, connect it to the tenets, and point to what’s coming <strong>Next</strong>. Think of them as narrative signposts to help you keep track.</p> </div> </div> </div> </div> |
| <p>We will get started by enumerating the tenets. Then we’ll look at concrete examples that show how they shape our decision-making. These examples are necessarily detailed, and sometimes complex, because they illustrate the challenges of maintaining and growing a large codebase that caters to multiple collectives, has millions of users, hundreds of contributors, and always strives for simplicity and consistency.</p> |
| <h2 id="the-core-tenets-of-transformers"><a href="#the-core-tenets-of-transformers">The core tenets of transformers</a></h2> |
| <p>We summarize the foundations on which we’ve built everything, and write the “tenets” of the library. They behave like <em>software interfaces</em>, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.</p> |
| <p>These principles were not decided in a vacuum. The library <em>evolved</em> towards them, and once they <em>emerged</em>, they were recognized as critical.</p> |
| <div class="tenet-list"><ol><li class="tenet"><a id="source-of-truth"></a><strong>Source of Truth</strong><p>We aim to be a <a href="https://huggingface.co/blog/transformers-model-definition">source of truth for all model definitions</a>. This is more of a goal than a tenet, but it strongly guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original implementations. If we are successful, they should become reference baselines for the ecosystem, so they’ll be easily adopted by downstream libraries and projects. It’s much easier for a project to always refer to the transformers implementation, than to learn a different research codebase every time a new architecture is released.</p><em>This overarching guideline ensures quality and reproducibility across all models in the library, and aspires to make the community work easier.</em></li><li class="tenet"><a id="one-model-one-file"></a><strong>One Model, One File</strong><p>All inference and training core logic has to be visible, top‑to‑bottom, to maximize each model’s hackability.</p><em>Every model should be completely understandable and hackable by reading a single file from top to bottom.</em></li><li class="tenet"><a id="code-is-product"></a><strong>Code is Product</strong><p>Optimize for reading, diffing, and tweaking, our users are power users. 
Variables should be explicit, full words, even several words, readability is primordial.</p><em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em></li><li class="tenet"><a id="standardize-dont-abstract"></a><strong>Standardize, Don’t Abstract</strong><p>If it’s model behavior, keep it in the file; use abstractions only for generic infra.</p><em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em></li><li class="tenet"><a id="do-repeat-yourself"></a><strong>DRY* (DO Repeat Yourself)</strong><p>Copy when it helps users; keep successors in sync without centralizing behavior.</p><p><b>Evolution:</b></p><p> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p><em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em></li><li class="tenet"><a id="minimal-user-api"></a><strong>Minimal User API</strong><p>Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p><em>Keep the public interface simple and predictable, users should know what to expect.</em></li><li class="tenet"><a id="backwards-compatibility"></a><strong>Backwards Compatibility</strong><p>Evolve by additive standardization, never break public APIs.</p><p>Any artifact that was once on the hub and loadable with transformers should be usable indefinitely with the same interface. Further, public methods should not change to avoid breaking dependencies. 
If we do deprecate something, it’s with very long cycles beforehand.</p><em>Once something is public, it stays public, evolution through addition, not breaking changes.</em></li><li class="tenet"><a id="consistent-public-surface"></a><strong>Consistent Public Surface</strong><p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests. This is a goal as well as a tenet.</p><em>All models should feel familiar - consistent interfaces reduce cognitive load.</em></li></ol></div> |
| <p>When a PR is merged, it is because the contribution is worthwhile, and because the <code>transformers</code> team finds the design of the contribution to be aligned with the tenets.</p> |
| <p>Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, crannies everywhere, built by thousands of different workers. We <em>try</em> to make it so all the code added is compliant, because if we fail and merge it, we cannot change it lest we break <span class="glossary-term " style="" data-tooltip-id="tenet-z9lfki7mgu9" onmouseenter="window.showTenetTooltip(event, 'tenet-z9lfki7mgu9')" onmousemove="window.updateTenetTooltip(event, 'tenet-z9lfki7mgu9')" onmouseleave="window.hideTenetTooltip('tenet-z9lfki7mgu9')">backwards compatibility</span><span id="tenet-z9lfki7mgu9" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <p>To see what constitutes adherence to the tenets, let’s take the example of code repetition.</p> |
| <p>The following function, essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a>, can be found in more than 70 <code>modeling_&lt;file&gt;.py</code> files across <code>src/transformers/models/</code>. Why keep it? Because we want all the model logic to be <span class="glossary-term " style="" data-tooltip-id="tenet-t1hrul1dtgi" onmouseenter="window.showTenetTooltip(event, 'tenet-t1hrul1dtgi')" onmousemove="window.updateTenetTooltip(event, 'tenet-t1hrul1dtgi')" onmouseleave="window.hideTenetTooltip('tenet-t1hrul1dtgi')">contained in the modeling file</span><span id="tenet-t1hrul1dtgi" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. In order to do that, we <span class="glossary-term " style="" data-tooltip-id="tenet-4148jyt1yb5" onmouseenter="window.showTenetTooltip(event, 'tenet-4148jyt1yb5')" onmousemove="window.updateTenetTooltip(event, 'tenet-4148jyt1yb5')" onmouseleave="window.hideTenetTooltip('tenet-4148jyt1yb5')">do repeat ourselves</span><span id="tenet-4148jyt1yb5" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> rotate_half</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(x):</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> """Rotates half the hidden dims of the input."""</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x1 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, : x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x2 </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> x[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, x.shape[</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> :]</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.cat((</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">x2, x1), </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>We want all models to have self-contained modeling code. Every core functionality <em>must</em> be in the modeling code, every non-core functionality <em>can</em> be outside of it.</p> |
| <p>This comes at a great cost. For years, we have used what we call the <code>#Copied from...</code> mechanism: we added comments of a specific format documenting that some code was copied from another model, saving time both for the reviewers and for the CI: we had tooling to ensure that the copied blocks remained in sync.</p> |
| <p>But the LOC count kept creeping up. Each new model copied over hundreds of lines that we considered largely boilerplate, yet we could not remove them.</p> |
| <p>We needed to separate two principles that were so far intertwined, <span class="glossary-term " style="" data-tooltip-id="tenet-qpqrhtltnf" onmouseenter="window.showTenetTooltip(event, 'tenet-qpqrhtltnf')" onmousemove="window.updateTenetTooltip(event, 'tenet-qpqrhtltnf')" onmouseleave="window.hideTenetTooltip('tenet-qpqrhtltnf')">repetition</span><span id="tenet-qpqrhtltnf" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> and <span class="glossary-term " style="" data-tooltip-id="tenet-0xs4wsaoaj9" onmouseenter="window.showTenetTooltip(event, 'tenet-0xs4wsaoaj9')" onmousemove="window.updateTenetTooltip(event, 'tenet-0xs4wsaoaj9')" onmouseleave="window.hideTenetTooltip('tenet-0xs4wsaoaj9')">hackability</span><span id="tenet-0xs4wsaoaj9" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <p>What was the solution to this? Let’s talk about modular transformers.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> Read the code in one place, <span class="glossary-term " style="" data-tooltip-id="tenet-o495mlt3n1b" onmouseenter="window.showTenetTooltip(event, 'tenet-o495mlt3n1b')" onmousemove="window.updateTenetTooltip(event, 'tenet-o495mlt3n1b')" onmouseleave="window.hideTenetTooltip('tenet-o495mlt3n1b')">one model, one file</span><span id="tenet-o495mlt3n1b" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. Keep semantics local (<a href="#standardize-dont-abstract">Standardize, Don’t Abstract</a>). Allow strategic duplication for end users (<a href="#do-repeat-yourself">DRY*</a>). Keep the public surface minimal and stable (<a href="#minimal-user-api">Minimal API</a>, <a href="#backwards-compatibility">Backwards Compatibility</a>, <a href="#consistent-public-surface">Consistent Surface</a>).</p><p><strong>Next:</strong> how modular transformers honor these while removing boilerplate.</p> </div> </div> </div> </div> |
| <h2 id="-modular-transformers"><a id="modular"></a><a href="#-modular-transformers">Modular transformers</a></h2> |
| <p>Transformers is an opinionated library. The previous <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page, and the <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post</a> were already pointing at the drawbacks mentioned just above, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers"><code>modular</code> transformers was introduced</a> to allow a form of inheritance without breaking <span class="glossary-term " style="" data-tooltip-id="tenet-k0n3mginvbr" onmouseenter="window.showTenetTooltip(event, 'tenet-k0n3mginvbr')" onmousemove="window.updateTenetTooltip(event, 'tenet-k0n3mginvbr')" onmouseleave="window.hideTenetTooltip('tenet-k0n3mginvbr')">the one model, one file rule.</span><span id="tenet-k0n3mginvbr" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script></p> |
| <p>We amended the principle of <span class="glossary-term " style="" data-tooltip-id="tenet-5p3255optgb" onmouseenter="window.showTenetTooltip(event, 'tenet-5p3255optgb')" onmousemove="window.updateTenetTooltip(event, 'tenet-5p3255optgb')" onmouseleave="window.hideTenetTooltip('tenet-5p3255optgb')">DRY*</span><span id="tenet-5p3255optgb" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> by progressively removing all pieces of code that were “copied from” another file.</p> |
| <p>It works as follows. In order to contribute a model, <code>GLM</code> for instance, we define a <code>modular_</code> file that can inherit from <em>any function across all other modeling, configuration and processor files</em> already existing in the library. |
| The modular file can use inheritance across models; it is then unravelled into a fully functional modeling file.</p> |
| <div class="reference-wrapper" data-astro-cid-e5g6tzce> <figure class="reference" data-astro-cid-e5g6tzce> <div class="reference__content" data-astro-cid-e5g6tzce> <div class="wide"> <div class="stack " data-layout="2-column" data-gap="medium" style="gap: 1rem" data-astro-cid-rlzglfcb> <div><h4 id="modular_glmpy"><a href="#modular_glmpy">modular_glm.py</a></h4><div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmMLP</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Phi3MLP</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> pass</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmAttention</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">LlamaAttention</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> __init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, config, layer_idx</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> super</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">().</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">__init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(config, layer_idx)</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.o_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_attention_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">False</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmForCausalLM</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">LlamaForCausalLM</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> pass</span></span> |
| <span class="line"></span></code></pre></div></div><div style="max-height: 400px; overflow-y: auto !important; overflow-x: hidden;"><h4 id="modeling_glmpy-auto-expanded"><a href="#modeling_glmpy-auto-expanded">modeling_glm.py (auto-expanded)</a></h4><div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmMLP</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> __init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, config):</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> super</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">().</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">__init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">()</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.gate_up_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> 2</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> *</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.intermediate_size, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">False</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.down_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.intermediate_size, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">False</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.activation_fn </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ACT2FN</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">[config.hidden_act]</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> forward</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> up_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.gate_up_proj(hidden_states)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> gate, up_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> up_states.chunk(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> up_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> up_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.activation_fn(gate)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.down_proj(up_states)</span></span> |
| <span class="line"></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmAttention</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> """Multi-headed attention from 'Attention Is All You Need' paper"""</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> __init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, config: GlmConfig, layer_idx: Optional[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">int</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> super</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">().</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">__init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">()</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.layer_idx </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> layer_idx</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> getattr</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config, </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"head_dim"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_attention_heads</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.num_key_value_groups </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> (</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_attention_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">//</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_key_value_heads</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.scaling </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">**-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">0.5</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.attention_dropout </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.attention_dropout</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.is_causal </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> True</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.q_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_attention_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">config.attention_bias</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.k_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_key_value_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">config.attention_bias</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.v_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_key_value_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">config.attention_bias</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.o_proj </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Linear(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.num_attention_heads </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim, </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> config.hidden_size, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> bias</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">False</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> forward</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> self,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states: torch.Tensor,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> position_embeddings: Tuple[torch.Tensor, torch.Tensor],</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_mask: Optional[torch.Tensor],</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> past_key_value: Optional[Cache] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> cache_position: Optional[torch.LongTensor] </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> **</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">kwargs: Unpack[FlashAttentionKwargs],</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_shape </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states.shape[:</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_shape </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> (</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">input_shape, </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.head_dim)</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> query_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.q_proj(hidden_states).view(hidden_shape).transpose(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> key_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.k_proj(hidden_states).view(hidden_shape).transpose(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> value_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.v_proj(hidden_states).view(hidden_shape).transpose(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> cos, sin </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> position_embeddings</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> query_states, key_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> apply_rotary_pos_emb(query_states, key_states, cos, sin)</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> past_key_value </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">is</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> not</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> cache_kwargs </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> {</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"sin"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: sin, </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"cos"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: cos, </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"cache_position"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: cache_position}</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> key_states, value_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> past_key_value.update(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> key_states, value_states, </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.layer_idx, cache_kwargs</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface: Callable </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> eager_attention_forward</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "eager"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ALL_ATTENTION_FUNCTIONS</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation]</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attn_output, attn_weights </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface(</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, query_states, key_states, value_states,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_mask, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dropout</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">0.0</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> not</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.training </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">else</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.attention_dropout,</span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> scaling</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.scaling, </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">**</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">kwargs,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attn_output </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attn_output.reshape(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">input_shape, </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">).contiguous()</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attn_output </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.o_proj(attn_output)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attn_output, attn_weights</span></span> |
| <span class="line"></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">@use_kernel_forward_from_hub(</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"RMSNorm"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmRMSNorm</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> __init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, hidden_size, eps</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1e-6</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> super</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">().</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">__init__</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">()</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.weight </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> nn.Parameter(torch.ones(hidden_size))</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.variance_epsilon </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> eps</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> forward</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(self, hidden_states):</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_dtype </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states.dtype</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states.to(torch.float32)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> variance </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states.pow(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">2</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">).mean(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">keepdim</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">True</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.rsqrt(variance </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">+</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.variance_epsilon)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.weight </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">*</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> hidden_states.to(input_dtype)</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># ... (many more classes and functions would follow)</span></span> |
| <span class="line"></span></code></pre></div></div> </div> </div> </div> <figcaption class="reference__caption" data-astro-cid-e5g6tzce><strong>Left:</strong> Clean modular definition with inheritance. <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.</figcaption> </figure> </div> |
| <p>As you can see, we can define a new model as a <em>modular</em> combination of fragments taken from others.</p> |
| <p>You might think “well that’s just how inheritance works”. The crucial difference is that we do <em>visibly</em> what is essentially the <em>compiler</em>’s job: by unrolling the inheritances, we make visible all of the modeling code, keeping it <span class="glossary-term " style="" data-tooltip-id="tenet-i6e4nxtldh" onmouseenter="window.showTenetTooltip(event, 'tenet-i6e4nxtldh')" onmousemove="window.updateTenetTooltip(event, 'tenet-i6e4nxtldh')" onmouseleave="window.hideTenetTooltip('tenet-i6e4nxtldh')">all in one piece.</span><span id="tenet-i6e4nxtldh" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script></p> |
| <p>You can see below the difference between <code>GlmAttention</code> and <code>LlamaAttention</code>, with the latter having been copied with minimal changes.</p> |
| <div class="wide"> <div class="ri-root" data-ri-root="ri_umnlvyp97n" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_glm_attn.D7pkKjAT_1axKuC.webp" alt="Llama vs GLM" data-zoomable="1" data-astro-cid-6kov3kig width="2169" height="482" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 1:</strong> Comparison of attention implementations between Llama and GLM, showing code reuse with minimal modifications.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> </div> |
| <p>What is the consequence? When adding a model, we do not need to go over the entire modeling file. The modular (left side above) is enough.</p> |
| <p>When <code>AutoModel.from_pretrained(...)</code> is called, it is indeed the modeling (right side) that is run, and all the tests run on the modeling code.</p> |
| <p>More importantly, the auto-generated modeling file is what users <em>read</em> to understand the code, what they step through in their debuggers and what they hack for their needs.</p> |
| <p>What does that give us?</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p><strong>TL;DR:</strong> A small <code>modular_*.py</code> declares reuse; the expanded modeling file stays visible and <span class="glossary-term " style="" data-tooltip-id="tenet-qteoxaap8ie" onmouseenter="window.showTenetTooltip(event, 'tenet-qteoxaap8ie')" onmousemove="window.updateTenetTooltip(event, 'tenet-qteoxaap8ie')" onmouseleave="window.hideTenetTooltip('tenet-qteoxaap8ie')">unique</span><span id="tenet-qteoxaap8ie" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. Reviewers and contributors maintain the shard, not the repetition.</p><p><strong>Next:</strong> the measurable effect on effective LOC and maintenance cost.</p> </div> </div> </div> </div> |
| <h3 id="a-maintainable-control-surface"><a href="#a-maintainable-control-surface">A maintainable control surface</a></h3> |
| <p>The effect of modular can be measured in lines of code (LOC). If a model only has a modeling file, we add its LOC count. |
| However, if a model has a <code>modular_*.py</code> and a corresponding automatically generated <code>modeling_*.py</code>, we only count the LOC under the modular file. The modeling code has no maintenance cost as it is strictly dependent on the modular file.</p> |
| <p>That gives an “effective LOC” curve: the <strong>maintenance surface</strong>.</p> |
| <p>Measured on git history, raw <code>modeling_*.py</code> grew at ~362 LOC/day before modular; counting only modular shards yields ~25 LOC/day after — about <strong>15× lower</strong>. The effective curve (blue line below) represents the <strong>maintenance surface</strong> today: what maintainers actually read and review.</p> |
| <p>Less code to hand-maintain means fewer places to break. Naturally LOC is not a direct measure of complexity, but they correlate in review effort and change risk.</p> |
| <figure class="html-embed"><div class="html-embed__card"><div id="frag-aquhfzwnfz9"><div class="d3-loc-growth"></div> |
| <style> |
| /* Styles for the LOC-growth chart; every selector is scoped under .d3-loc-growth. */ |
| .d3-loc-growth { position: relative; } |
| |
| .d3-loc-growth .controls { |
| margin-top: 0; |
| display: flex; |
| gap: 16px; |
| align-items: center; |
| justify-content: flex-end; |
| width: auto; |
| flex-wrap: wrap; |
| } |
| |
| .d3-loc-growth .controls .control-group { |
| display: flex; |
| flex-direction: column; |
| align-items: flex-start; |
| gap: 6px; |
| } |
| |
| .d3-loc-growth .controls label { |
| font-size: 12px; |
| color: var(--text-color); |
| display: flex; |
| align-items: center; |
| gap: 6px; |
| white-space: nowrap; |
| font-weight: 700; |
| } |
| |
| .d3-loc-growth .controls select { |
| font-size: 12px; |
| padding: 8px 28px 8px 10px; |
| border: 1px solid var(--border-color); |
| border-radius: 8px; |
| background-color: var(--surface-bg); |
| color: var(--text-color); |
| background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%230f1115' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
| background-repeat: no-repeat; |
| background-position: right 8px center; |
| background-size: 12px; |
| -webkit-appearance: none; |
| appearance: none; |
| cursor: pointer; |
| transition: border-color .15s ease, box-shadow .15s ease; |
| } |
| |
| /* Same chevron, white stroke, for the dark theme. */ |
| [data-theme="dark"] .d3-loc-growth .controls select { |
| background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23ffffff' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
| } |
| |
| .d3-loc-growth .controls select:hover { border-color: var(--primary-color); } |
| .d3-loc-growth .controls select:focus { border-color: var(--primary-color); box-shadow: 0 0 0 3px rgba(232,137,171,.25); outline: none; } |
| |
| .d3-loc-growth .axis-label { fill: var(--text-color); font-size: 12px; font-weight: 700; } |
| .d3-loc-growth .axes path, .d3-loc-growth .axes line { stroke: var(--axis-color); } |
| .d3-loc-growth .axes text { fill: var(--tick-color); } |
| .d3-loc-growth .grid line { stroke: var(--grid-color); } |
| |
| .d3-loc-growth .legend { font-size: 12px; color: var(--text-color); padding-left: 6px; } |
| .d3-loc-growth .legend .items { display: flex; flex-wrap: wrap; gap: 8px 12px; align-items: center; } |
| .d3-loc-growth .legend .item { display: flex; align-items: center; gap: 6px; white-space: nowrap; } |
| .d3-loc-growth .legend .swatch { width: 14px; height: 14px; border-radius: 3px; border: 1px solid var(--border-color); display: inline-block; } |
| |
| /* While one series is hovered, everything marked .ghost is dimmed. */ |
| .d3-loc-growth.hovering .legend-bottom .item.ghost { opacity: .35; } |
| .d3-loc-growth.hovering .lines path.ghost { opacity: .25; } |
| .d3-loc-growth.hovering .points circle.ghost { opacity: .25; } |
| |
| /* Single definition; carries the values that previously won the cascade from a second, later rule. */ |
| .d3-loc-growth .chart-header { display: flex; align-items: flex-start; justify-content: flex-start; gap: 12px; margin: 8px 0 0 0; flex-wrap: wrap; } |
| /* Single definition; merges the former base rule with its later column-layout override. */ |
| .d3-loc-growth .legend-bottom { |
| display: flex; |
| flex-direction: column; |
| align-items: flex-start; |
| justify-content: flex-start; |
| gap: 6px; |
| font-size: 12px; |
| color: var(--text-color); |
| } |
| .d3-loc-growth .legend-bottom .items { display: flex; flex-wrap: wrap; gap: 8px 14px; } |
| .d3-loc-growth .legend-bottom .item { display: inline-flex; align-items: center; gap: 6px; white-space: nowrap; } |
| .d3-loc-growth .legend-bottom .swatch { width: 14px; height: 14px; border-radius: 3px; border: 1px solid var(--border-color); display: inline-block; } |
| .d3-loc-growth .legend-bottom .legend-title { |
| font-size: 12px; |
| font-weight: 700; |
| color: var(--text-color); |
| } |
| .d3-loc-growth .lines path.active { stroke-width: 3; } |
| |
| .d3-loc-growth .d3-tooltip { z-index: var(--z-elevated); backdrop-filter: saturate(1.12) blur(8px); } |
| .d3-loc-growth .d3-tooltip__inner { display: flex; flex-direction: column; gap: 6px; min-width: 220px; } |
| .d3-loc-growth .d3-tooltip__inner > div:first-child { font-weight: 800; letter-spacing: 0.1px; margin-bottom: 0; } |
| .d3-loc-growth .d3-tooltip__inner > div:nth-child(2) { font-size: 11px; color: var(--muted-color); display: block; margin-top: -4px; margin-bottom: 2px; letter-spacing: 0.1px; } |
| .d3-loc-growth .d3-tooltip__inner > div:nth-child(n+3) { padding-top: 6px; border-top: 1px solid var(--border-color); } |
| .d3-loc-growth .d3-tooltip__color-dot { display: inline-block; width: 12px; height: 12px; border-radius: 3px; border: 1px solid var(--border-color); } |
| |
| .d3-loc-growth .chart-card { background: var(--surface-bg); border: 1px solid var(--border-color); border-radius: 10px; padding: 8px; } |
| </style> |
| <script> |
| (() => { |
| const ensureD3 = (cb) => { |
| if (window.d3 && typeof window.d3.select === 'function') return cb(); |
| let s = document.getElementById('d3-cdn-script'); |
| if (!s) { s = document.createElement('script'); s.id = 'd3-cdn-script'; s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js'; document.head.appendChild(s); } |
| const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); }; |
| s.addEventListener('load', onReady, { once: true }); if (window.d3) onReady(); |
| }; |
| |
| const bootstrap = () => { |
| const scriptEl = document.currentScript; |
| let container = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!(container && container.classList && container.classList.contains('d3-loc-growth'))){ |
| const cs = Array.from(document.querySelectorAll('.d3-loc-growth')).filter(el => !(el.dataset && el.dataset.mounted === 'true')); |
| container = cs[cs.length - 1] || null; |
| } |
| if (!container) return; |
| if (container.dataset) { if (container.dataset.mounted === 'true') return; container.dataset.mounted = 'true'; } |
| |
| |
| |
| |
| container.style.position = container.style.position || 'relative'; |
| let tip = container.querySelector('.d3-tooltip'); let tipInner; |
| if (!tip) { |
| tip = document.createElement('div'); tip.className = 'd3-tooltip'; |
| Object.assign(tip.style, { |
| position:'absolute', top:'0px', left:'0px', transform:'translate(-9999px, -9999px)', pointerEvents:'none', |
| padding:'8px 10px', borderRadius:'8px', fontSize:'12px', lineHeight:'1.35', border:'1px solid var(--border-color)', |
| background:'var(--surface-bg)', color:'var(--text-color)', boxShadow:'0 4px 24px rgba(0,0,0,.18)', opacity:'0', transition:'opacity .12s ease' |
| }); |
| tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tipInner.style.textAlign='left'; tip.appendChild(tipInner); container.appendChild(tip); |
| } else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; } |
| |
| |
| const header = document.createElement('div'); header.className = 'chart-header'; |
| const legendBottom = document.createElement('div'); legendBottom.className = 'legend-bottom'; header.appendChild(legendBottom); |
| |
| |
| const card = document.createElement('div'); card.className = 'chart-card'; container.appendChild(card); |
| container.appendChild(header); |
| |
| const svg = d3.select(card).append('svg').attr('width','100%').style('display','block'); |
| const gRoot = svg.append('g'); |
| const gGrid = gRoot.append('g').attr('class','grid'); |
| const gAxes = gRoot.append('g').attr('class','axes'); |
| const gLines = gRoot.append('g').attr('class','lines'); |
| const gPoints = gRoot.append('g').attr('class','points'); |
| const overlay = gRoot.append('rect').attr('fill','transparent').style('cursor','crosshair'); |
| const hoverLine = gRoot.append('line').attr('stroke-width',1).style('display','none'); |
| |
| |
| let width = 800, height = 480; const margin = { top: 16, right: 32, bottom: 44, left: 56 }; |
| const xScale = d3.scaleTime(); |
| const yScale = d3.scaleLinear(); |
| const lineGen = d3.line().x(d => xScale(d.date)).y(d => yScale(d.value)); |
| const dataByMetric = new Map(); |
| let runOrder = []; |
| |
| |
| function getRunColors(count){ |
| |
| return ['#4e79a7', '#59a14f', '#f28e2b', '#e15759']; |
| } |
| |
| |
| function formatK(v){ |
| const abs = Math.abs(v); |
| if (abs >= 1000) { |
| const n = v / 1000; |
| const s = d3.format('.1f')(n); |
| return (s.endsWith('.0') ? s.slice(0, -2) : s) + 'k'; |
| } |
| return d3.format('d')(v); |
| } |
| |
| |
| const sampleData = [ |
| |
| { date: new Date('2020-11-16'), effective: 65496, modular: 0, modelingAll: 65496, modelingIncluded: 65496 }, |
| { date: new Date('2021-01-12'), effective: 93990, modular: 0, modelingAll: 93990, modelingIncluded: 93990 }, |
| { date: new Date('2021-06-14'), effective: 139949, modular: 0, modelingAll: 139949, modelingIncluded: 139949 }, |
| { date: new Date('2021-12-06'), effective: 194731, modular: 0, modelingAll: 194731, modelingIncluded: 194731 }, |
| |
| |
| { date: new Date('2022-01-10'), effective: 204653, modular: 0, modelingAll: 204653, modelingIncluded: 204653 }, |
| { date: new Date('2022-06-30'), effective: 266522, modular: 0, modelingAll: 266522, modelingIncluded: 266522 }, |
| { date: new Date('2022-12-17'), effective: 200853, modular: 0, modelingAll: 200853, modelingIncluded: 200853 }, |
| |
| |
| { date: new Date('2023-01-10'), effective: 201000, modular: 0, modelingAll: 201000, modelingIncluded: 201000 }, |
| { date: new Date('2023-06-01'), effective: 250000, modular: 0, modelingAll: 250000, modelingIncluded: 250000 }, |
| { date: new Date('2023-12-01'), effective: 300000, modular: 0, modelingAll: 300000, modelingIncluded: 300000 }, |
| |
| |
| { date: new Date('2024-01-01'), effective: 320000, modular: 0, modelingAll: 320000, modelingIncluded: 320000 }, |
| { date: new Date('2024-05-31'), effective: 520764, modular: 0, modelingAll: 520764, modelingIncluded: 520764 }, |
| |
| |
| { date: new Date('2024-09-24'), effective: 546778, modular: 2896, modelingAll: 550264, modelingIncluded: 543882 }, |
| { date: new Date('2024-12-20'), effective: 549627, modular: 10536, modelingAll: 562826, modelingIncluded: 539091 }, |
| |
| |
| { date: new Date('2025-01-08'), effective: 548716, modular: 12024, modelingAll: 567397, modelingIncluded: 536692 }, |
| { date: new Date('2025-03-31'), effective: 562251, modular: 25759, modelingAll: 596317, modelingIncluded: 536492 }, |
| { date: new Date('2025-05-14'), effective: 537351, modular: 42516, modelingAll: 591941, modelingIncluded: 494835 }, |
| { date: new Date('2025-07-25'), effective: 538722, modular: 62444, modelingAll: 593856, modelingIncluded: 476278 }, |
| { date: new Date('2025-09-18'), effective: 407744, modular: 80421, modelingAll: 465464, modelingIncluded: 327323 }, |
| { date: new Date('2025-09-24'), effective: 408954, modular: 88852, modelingAll: 471514, modelingIncluded: 320102 }, |
| ]; |
| |
| |
| runOrder = ['effective', 'modular', 'modelingAll', 'modelingIncluded']; |
| const metricLabels = { |
| 'effective': 'Effective LOC', |
| 'modular': 'Modular LOC', |
| 'modelingAll': 'Modeling LOC (all)', |
| 'modelingIncluded': 'Modeling LOC (included)' |
| }; |
| |
| |
| const dataMap = {}; |
| runOrder.forEach(run => { |
| dataMap[run] = sampleData.map(d => ({ |
| date: d.date, |
| value: d[run] || 0 |
| })); |
| }); |
| |
| function updateLayout(){ |
| const axisColor = getComputedStyle(container).getPropertyValue('--axis-color').trim() || 'rgba(0,0,0,0.25)'; |
| width = container.clientWidth || 800; |
| height = Math.max(280, Math.round(width / 3)); |
| svg.attr('width', width).attr('height', height); |
| gRoot.attr('transform', `translate(${margin.left},${margin.top})`); |
| const innerWidth = width - margin.left - margin.right; |
| const innerHeight = height - margin.top - margin.bottom; |
| overlay.attr('x',0).attr('y',0).attr('width', innerWidth).attr('height', innerHeight); |
| hoverLine.attr('y1',0).attr('y2', innerHeight).attr('stroke', axisColor); |
| return { innerWidth, innerHeight }; |
| } |
| |
| function render(){ |
| const { innerWidth, innerHeight } = updateLayout(); |
| const map = dataMap; |
| const runs = runOrder; |
| |
| |
| let minDate = Infinity, maxDate = -Infinity, minV = Infinity, maxV = -Infinity; |
| runs.forEach(r => { |
| (map[r]||[]).forEach(pt => { |
| minDate = Math.min(minDate, pt.date); |
| maxDate = Math.max(maxDate, pt.date); |
| minV = Math.min(minV, pt.value); |
| maxV = Math.max(maxV, pt.value); |
| }); |
| }); |
| if (!isFinite(minDate) || !isFinite(maxDate)) return; |
| |
| xScale.domain([minDate, maxDate]).range([0, innerWidth]); |
| yScale.domain([minV, maxV]).nice().range([innerHeight, 0]); |
| |
| |
| gGrid.selectAll('*').remove(); |
| gGrid.selectAll('line').data(yScale.ticks(6)).join('line') |
| .attr('x1',0).attr('x2', innerWidth).attr('y1', d=>yScale(d)).attr('y2', d=>yScale(d)) |
| .attr('stroke','var(--grid-color)').attr('stroke-width',1).attr('shape-rendering','crispEdges'); |
| |
| |
| gAxes.selectAll('*').remove(); |
| gAxes.append('g').attr('transform', `translate(0,${innerHeight})`).call(d3.axisBottom(xScale).ticks(8).tickFormat(d3.timeFormat('%Y'))).call(g=>{ g.selectAll('path, line').attr('stroke','var(--axis-color)'); g.selectAll('text').attr('fill','var(--tick-color)').style('font-size','12px'); }); |
| gAxes.append('g').call(d3.axisLeft(yScale).ticks(6).tickFormat(formatK)).call(g=>{ g.selectAll('path, line').attr('stroke','var(--axis-color)'); g.selectAll('text').attr('fill','var(--tick-color)').style('font-size','12px'); }); |
| gAxes.append('text').attr('class','axis-label').attr('text-anchor','middle').attr('x', innerWidth/2).attr('y', innerHeight + 38).text('Year'); |
| gAxes.append('text').attr('class','axis-label').attr('text-anchor','middle').attr('transform', `translate(${-44}, ${innerHeight/2}) rotate(-90)`).text('Lines of Code'); |
| |
| |
| const modularDate = new Date('2024-05-31'); |
| if (modularDate >= minDate && modularDate <= maxDate) { |
| const xModular = xScale(modularDate); |
| gAxes.append('line') |
| .attr('x1', xModular).attr('x2', xModular) |
| .attr('y1', 0).attr('y2', innerHeight) |
| .attr('stroke', '#ff0000') |
| .attr('stroke-width', 2) |
| .attr('stroke-dasharray', '5,5') |
| .style('opacity', 0.8); |
| |
| gAxes.append('text') |
| .attr('x', xModular) |
| .attr('y', -8) |
| .attr('text-anchor', 'middle') |
| .attr('fill', '#ff0000') |
| .attr('font-size', '10px') |
| .attr('font-weight', 'bold') |
| .text('2024-05-31 modular'); |
| } |
| |
| |
| const colors = getRunColors(runs.length); |
| const series = runs.map((r, i) => ({ |
| run: r, |
| color: colors[i % colors.length], |
| values: (map[r]||[]).slice().sort((a,b)=>a.date-b.date) |
| })); |
| |
| const paths = gLines.selectAll('path.run').data(series, d=>d.run); |
| const pathsEnter = paths.enter().append('path').attr('class','run').attr('fill','none').attr('stroke-width',2).attr('stroke', d=>d.color).attr('d', d=>lineGen(d.values)); |
| pathsEnter.merge(paths).transition().duration(200).attr('stroke', d=>d.color).attr('d', d=>lineGen(d.values)); |
| paths.exit().remove(); |
| |
| |
| const captures = gLines.selectAll('path.run-hover').data(series, d=>`cap-${d.run}`); |
| captures.enter().append('path').attr('class','run-hover').attr('fill','none').attr('stroke','transparent').attr('stroke-width', 12).style('pointer-events','stroke') |
| .attr('d', d=>lineGen(d.values)) |
| .merge(captures) |
| .attr('d', d=>lineGen(d.values)) |
| .on('mouseenter', function(ev, d){ |
| container.classList.add('hovering'); |
| |
| gLines.selectAll('path.run').classed('ghost', s => s.run !== d.run); |
| gPoints.selectAll('circle.pt').classed('ghost', p => p.run !== d.run); |
| |
| try { |
| const legendNode = legendBottom; |
| if (legendNode) { |
| legendNode.querySelectorAll('.item').forEach(el => { |
| const name = el.getAttribute('data-run'); |
| el.classList.toggle('ghost', name !== d.run); |
| }); |
| } |
| } catch {} |
| }) |
| .on('mouseleave', function(){ |
| container.classList.remove('hovering'); |
| gLines.selectAll('path.run').classed('ghost', false); |
| gPoints.selectAll('circle.pt').classed('ghost', false); |
| try { const legendNode = legendBottom; if (legendNode) legendNode.querySelectorAll('.item').forEach(el => el.classList.remove('ghost')); } catch {} |
| }); |
| captures.exit().remove(); |
| |
| |
| const allPts = series.flatMap(s => s.values.map(v => ({ run:s.run, color:s.color, date:v.date, value:v.value }))); |
| const ptsSel = gPoints.selectAll('circle.pt').data(allPts, d=>`${d.run}-${d.date.getTime()}`); |
| ptsSel.enter().append('circle').attr('class','pt').attr('r', 2).attr('fill', d=>d.color).attr('fill-opacity', 0.6) |
| .attr('cx', d=>xScale(d.date)).attr('cy', d=>yScale(d.value)) |
| .merge(ptsSel).transition().duration(150).attr('cx', d=>xScale(d.date)).attr('cy', d=>yScale(d.value)); |
| ptsSel.exit().remove(); |
| |
| |
| legendBottom.innerHTML = `<div class="legend-title">Legend</div><div class="items">${series.map(s => `<span class="item" data-run="${s.run}"><span class="swatch" style="background:${s.color}"></span><span>${s.run}</span></span>`).join('')}</div>`; |
| |
| |
| try { |
| const legendNode = legendBottom; |
| legendNode.querySelectorAll('.item').forEach(el => { |
| el.addEventListener('mouseenter', () => { |
| const run = el.getAttribute('data-run'); if (!run) return; |
| container.classList.add('hovering'); |
| gLines.selectAll('path.run').classed('ghost', s => s.run !== run); |
| gPoints.selectAll('circle.pt').classed('ghost', p => p.run !== run); |
| legendNode.querySelectorAll('.item').forEach(it => it.classList.toggle('ghost', it.getAttribute('data-run') !== run)); |
| }); |
| el.addEventListener('mouseleave', () => { |
| container.classList.remove('hovering'); |
| gLines.selectAll('path.run').classed('ghost', false); |
| gPoints.selectAll('circle.pt').classed('ghost', false); |
| legendNode.querySelectorAll('.item').forEach(it => it.classList.remove('ghost')); |
| }); |
| }); |
| } catch {} |
| |
| |
| function onMove(ev){ |
| const [mx, my] = d3.pointer(ev, overlay.node()); |
| const sx = xScale.invert(mx); |
| |
| const dates = Array.from(new Set(allPts.map(p=>p.date))).sort((a,b)=>a-b); |
| const nearest = dates.reduce((best, d) => Math.abs(d - sx) < Math.abs(best - sx) ? d : best, dates[0]); |
| const xpx = xScale(nearest); |
| hoverLine.style('display', null).attr('x1', xpx).attr('x2', xpx); |
| |
| let html = `<div style="font-weight:800;letter-spacing:.1px;">Lines of Code Growth</div><div style="font-size:11px;color:var(--muted-color);margin-top:-4px;margin-bottom:2px;">${d3.timeFormat('%B %Y')(nearest)}</div>`; |
| const entries = series.map(s => { |
| const m = new Map(s.values.map(v=>[v.date.getTime(), v.value])); |
| const val = m.get(nearest.getTime()); |
| return { run: s.run, color: s.color, val }; |
| }).filter(e => e.val != null); |
| entries.sort((a, b) => a.val - b.val); |
| entries.forEach(e => { |
| html += `<div style="display:flex;align-items:center;gap:6px;white-space:nowrap;"><span class="d3-tooltip__color-dot" style="background:${e.color}"></span><strong>${e.run}</strong><span style="margin-left:auto;">${formatK(+e.val)}</span></div>`; |
| }); |
| tipInner.innerHTML = html; tip.style.opacity = '1'; tip.style.transform = `translate(${Math.round(mx + margin.left + 12)}px, ${Math.round(my + margin.top + 12)}px)`; |
| } |
| function onLeave(){ tip.style.opacity='0'; tip.style.transform='translate(-9999px, -9999px)'; hoverLine.style('display','none'); } |
| overlay.on('mousemove', onMove).on('mouseleave', onLeave); |
| } |
| |
| |
| render(); |
| const rerender = () => render(); |
| if (window.ResizeObserver) { const ro = new ResizeObserver(() => rerender()); ro.observe(container); } else { window.addEventListener('resize', rerender); } |
| }; |
| |
| if (document.readyState === 'loading') { document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true }); } else { ensureD3(bootstrap); } |
| })(); |
| </script> |
| </div></div></figure> |
| <p>The blue line (effective) is the sum of the red and green lines, whereas the orange line shows what the progression would have been without modular. We can see that the maintenance surface is essentially constant (in LOC) since the implementation of <code>modular</code>. |
| If you zoom in, you’ll notice a sharp drop near the end; it is essentially due to us <a href="https://github.com/huggingface/transformers/commit/4df2529d79d75f44e70396df5888a32ffa02d61e#diff-60849db3e9922197854ef1cac92bf4aba08b5d7fd3fe6f3c16a3511e29e0eacc">removing support for Jax and TensorFlow</a> library-wide.</p> |
| <p>But this was not the only effort that allowed us to reduce maintenance load.</p> |
| <p>We recently underwent a deep refactor of the attention implementation. You’ve likely heard about <a href="https://huggingface.co/docs/text-generation-inference/en/conceptual/flash_attention">flash attention</a> and its several variants.</p> |
| <p>The <em>attention computation</em> itself happens at a <em>lower</em> level of abstraction than the model itself.</p> |
| <p>However, we were adding specific torch operations for each backend (sdpa, the several flash-attention iterations, flex attention) but it wasn’t a <span class="glossary-term " style="" data-tooltip-id="tenet-zlbjzb8r96d" onmouseenter="window.showTenetTooltip(event, 'tenet-zlbjzb8r96d')" onmousemove="window.updateTenetTooltip(event, 'tenet-zlbjzb8r96d')" onmouseleave="window.hideTenetTooltip('tenet-zlbjzb8r96d')">minimal user api</span><span id="tenet-zlbjzb8r96d" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. The next section explains what we did.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Evidence: effective (i.e., maintainable) LOC growth drops ~15× when counting shards instead of expanded modeling files. Less code to read, fewer places to break.</p><p><strong>Next:</strong> how the attention interface stays standard without hiding semantics.</p> </div> </div> </div> </div> |
| <h3 id="-external-attention-classes"><a id="attention-classes"></a><a href="#-external-attention-classes">External Attention classes</a></h3> |
| <p>The solution for the “attention abstraction problem” was to move to a standard <a href="https://huggingface.co/docs/transformers/en/attention_interface">attention interface</a> that allows the following:</p> |
| <p>The naive implementation of attention, called “eager”, is available by default. We use a <code>Callable</code> called <code>eager_attention_forward</code>, which can run as long as the user has PyTorch installed – which is a requirement anyway.</p> |
| <p>Instead of using a class interface and a class hierarchy, we just moved to a function interface. When a more complex attention implementation is needed, we use other Callables, including much faster kernel bindings when available. The decision to use a different attention implementation is based on the model configuration file we download from the Hub, and it can also be overridden by the user.</p> |
| <p>This is a clear example that we prefer an interface that is <span class="glossary-term " style="" data-tooltip-id="tenet-6sfwep5gdpe" onmouseenter="window.showTenetTooltip(event, 'tenet-6sfwep5gdpe')" onmousemove="window.updateTenetTooltip(event, 'tenet-6sfwep5gdpe')" onmouseleave="window.hideTenetTooltip('tenet-6sfwep5gdpe')">standard, but not abstract</span><span id="tenet-6sfwep5gdpe" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. To be completely precise, this is what the interface selection looks like in transformers code:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">attention_interface: Callable </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> eager_attention_forward</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">if</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "eager"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> attention_interface </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ALL_ATTENTION_FUNCTIONS</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config._attn_implementation]</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>Having the attention interfaces functionalized allows us to do dynamic switching of attentions as well, increasing their <span class="glossary-term " style="" data-tooltip-id="tenet-l5yeue88frp" onmouseenter="window.showTenetTooltip(event, 'tenet-l5yeue88frp')" onmousemove="window.updateTenetTooltip(event, 'tenet-l5yeue88frp')" onmouseleave="window.hideTenetTooltip('tenet-l5yeue88frp')">hackability</span><span id="tenet-l5yeue88frp" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. |
| Another strength of the new attention interface is the possibility to enforce specific kwargs, which are needed by kernel providers and other dependencies.</p> |
| <p>Backend integrations sometimes require specific kwargs.</p> |
| <p>We know that kwargs are often a necessary evil that plagues tools with widespread compatibility; and it is something we have aimed to reduce, and will continue to reduce in order to improve readability - with them, the current system is a <span class="glossary-term " style="" data-tooltip-id="tenet-yhd2lnc5p8e" onmouseenter="window.showTenetTooltip(event, 'tenet-yhd2lnc5p8e')" onmousemove="window.updateTenetTooltip(event, 'tenet-yhd2lnc5p8e')" onmouseleave="window.hideTenetTooltip('tenet-yhd2lnc5p8e')">minimal user api</span><span id="tenet-yhd2lnc5p8e" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <p>We reduce that surface and document expectations; where flexibility is necessary, we plan to use <code>typing.Annotated</code> to convey shapes and invariants without constraining integrations. Such an implementation could look like this in the future:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">from</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> typing </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">import</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> Annotated</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">MyModelOutputAnnotated </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> Annotated[MyModelOutput, </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"shape: (B, C, H, W)"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span></span> |
| <span class="line"></span></code></pre></div> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Attention semantics remain in <code>eager_attention_forward</code>; faster backends are opt-in via config. We inform via types/annotations rather than enforce rigid kwargs, preserving integrations.</p><p><strong>Next:</strong> parallel partitioning is declared as a plan, not through model surgery.</p> </div> </div> </div> </div> |
| <h3 id="-configurable-tensor-parallelism"><a id="simpler-tensor-parallelism"></a><a href="#-configurable-tensor-parallelism">Configurable Tensor Parallelism</a></h3> |
| <p>If you’re not familiar with the different flavours of parallelism, I recommend checking out <a href="https://huggingface.co/blog/accelerate-nd-parallel">this blog post</a> first, and of course a full <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook">dive into the ultra-scale playbook</a> is always recommended.</p> |
| <p>The essential part is that, as <a href="https://huggingface.co/docs/transformers/v4.56.2/perf_train_gpu_many#tensor-parallelism">the documentation states</a>, when tensors get too large to fit on a single GPU, they are sliced along a particular dimension and every slice is sent to a different GPU.</p> |
| <p>Why does it matter?</p> |
| <p>Because we want to avoid code modifications that are unrelated to the model.</p> |
| <p>We choose to place the level of abstraction higher than the device placement: a matrix multiplication - a <code>nn.Linear</code> layer - should be always expressed in the same way, regardless of how it is placed.</p> |
| <p>Hence, we want to touch the modeling code <span class="glossary-term " style="" data-tooltip-id="tenet-uhl4iqah7yr" onmouseenter="window.showTenetTooltip(event, 'tenet-uhl4iqah7yr')" onmousemove="window.updateTenetTooltip(event, 'tenet-uhl4iqah7yr')" onmouseleave="window.hideTenetTooltip('tenet-uhl4iqah7yr')">as little as possible</span><span id="tenet-uhl4iqah7yr" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>, and only modify it when <em>architectural changes</em> are involved – not depending on the way you run it. For tensor parallelism, we simply specify a <code>tp_plan</code>:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># In the model's config (example: ERNIE 4.5-style decoder blocks)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">base_model_tp_plan </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> {</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.self_attn.q_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"colwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.self_attn.k_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"colwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.self_attn.v_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"colwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.self_attn.o_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"rowwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.mlp.gate_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"colwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.mlp.up_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"colwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layers.*.mlp.down_proj"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"rowwise"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">}</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># Runtime</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">import</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">from</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> transformers </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">import</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> AutoModelForCausalLM, AutoTokenizer</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">model_id </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "your/model-or-local-checkpoint"</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">model </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> AutoModelForCausalLM.from_pretrained( </span><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"># <-- will automatically map to the plan defined above</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> model_id, </span></span> |
| <span class="line"><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70"> dtype</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">torch.bfloat16,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">) </span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">tok </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> AutoTokenizer.from_pretrained(model_id)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">inputs </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> tok(</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"Hello"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">return_tensors</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"pt"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">).to(model.device)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">out </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> model(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">**</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">inputs)</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>The plan is written once, saved as part of the config and passed to <code>.from_pretrained()</code>. It maps module name patterns to partitioning strategies. Strategies are resolved by the internal <code>ParallelInterface</code>, which wires to sharding implementations <code>ColwiseParallel</code>, <code>RowwiseParallel</code>, packed variants, and so on.</p> |
| <p>The alternative would be to modify classes depending on supported types of parallelism.</p> |
| <p>The <code>tp_plan</code> solution allows users to run the same model on a single GPU, or distribute it using multiple processes per node, e.g. 4 GPUs:</p> |
| <p><code>torchrun --nproc-per-node 4 demo.py</code></p> |
| <p>Semantics stay in the model (a Linear stays a Linear); parallelization is orthogonal and declared via strings: “colwise” splits columns of weights/bias across ranks; “rowwise” splits rows; packed variants shard fused weights. The mapping keys accept glob patterns like <code>layers.*.mlp.down_proj</code> to target repeated submodules.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Parallelization is specified in the configuration (<code>tp_plan</code>), not through edits to <code>Linear</code>s. Glob patterns target repeated blocks; modeling semantics stay intact.</p><p><strong>Next:</strong> per-layer attention/caching schedules declared in config, not hardcoded.</p> </div> </div> </div> </div> |
| <h3 id="-layers-attentions-and-caches"><a id="layers-attentions-caches"></a><a href="#-layers-attentions-and-caches">Layers, attentions and caches</a></h3> |
| <p>Following the same logic, the <em>nature</em> of attention and per-layer caching should not be hardcoded. We should be able to specify in the configuration how each layer is implemented. Thus, we define a mapping like:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">ALLOWED_LAYER_TYPES</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> =</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> (</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "full_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "sliding_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "chunked_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "linear_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ...</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>and the configuration can be <em>explicit</em> about which attention type is in which layer. See, for example, <a href="https://huggingface.co/openai/gpt-oss-120b/blob/main/config.json#L15">gpt-oss</a>, which alternates sliding and full attention:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "layer_types"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">: [</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "sliding_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "full_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ...</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "sliding_attention"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> "full_attention"</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> ],</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>This is <span class="glossary-term " style="" data-tooltip-id="tenet-03rvyaxz6g9g" onmouseenter="window.showTenetTooltip(event, 'tenet-03rvyaxz6g9g')" onmousemove="window.updateTenetTooltip(event, 'tenet-03rvyaxz6g9g')" onmouseleave="window.hideTenetTooltip('tenet-03rvyaxz6g9g')">minimal</span><span id="tenet-03rvyaxz6g9g" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> to implement on the user side, and allows us to keep the modeling code untouched. It is also easy to tweak.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Allowed layer types are explicit; schedules (e.g., sliding/full alternation) live in config. This keeps the file readable and easy to tweak.</p><p><strong>Next:</strong> speedups come from kernels that don’t change semantics.</p> </div> </div> </div> </div> |
| <h3 id="community-kernels"><a href="#community-kernels">Community Kernels</a></h3> |
| <p>The same principle extends to normalization, activation, and other code paths. The model defines <strong>semantics</strong>; a kernel defines <strong>how</strong> to execute them faster. We annotate the module to borrow a community‑provided forward, keeping a <span class="glossary-term " style="" data-tooltip-id="tenet-8wzrsarwvyl" onmouseenter="window.showTenetTooltip(event, 'tenet-8wzrsarwvyl')" onmousemove="window.updateTenetTooltip(event, 'tenet-8wzrsarwvyl')" onmouseleave="window.hideTenetTooltip('tenet-8wzrsarwvyl')">consistent public surface</span><span id="tenet-8wzrsarwvyl" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">@use_kernel_forward_from_hub(</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"RMSNorm"</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> GlmRMSNorm</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ...</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>This also opens another contribution path: GPU specialists can contribute optimized kernels to the <a href="https://huggingface.co/kernels-community">Kernels Hub</a>, and have them immediately available to use in <code>transformers</code> and other libraries. You can check the <a href="https://huggingface.co/blog/hello-hf-kernels">kernel community blog post</a> to learn more about it!</p> |
| <p>Even more resources have been added, like the formidable <a href="https://github.com/huggingface/kernel-builder">kernel builder</a> with its connected resources to <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/writing-kernels.md">help you build kernels with it</a> and <a href="https://github.com/huggingface/kernel-builder/blob/main/docs/nix.md">with nix</a>.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Models define semantics; kernels define how to run them faster. Use decorators to borrow community forwards while keeping a consistent public surface.</p><p><strong>Next:</strong> what modularity looks like across the repo.</p> </div> </div> </div> </div> |
| <h2 id="a-modular-state"><a href="#a-modular-state">A Modular State</a></h2> |
| <p>With <code>modular</code> transformers, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to <em>define standards</em>. Pushing the boundaries of scientific knowledge can translate into pushing the boundaries of engineering if this effort is made, and we’re striving for it. |
| It’s hard to conceptualize very large libraries and how their components interact with each other, regardless of your cognitive abilities for abstractions. |
| So I wanted to take a look at the current <strong>state of modularity</strong> across the repository. How many models are defined using components of others?</p> |
| <p>To get this graph, I used the heuristic of modular inheritance.</p> |
| <ol> |
| <li>Does this model have a <code>modular</code> file?</li> |
| <li>In this <code>modular</code> file, what models, configurations and processings are imported?</li> |
| <li>Recurse through the model list that way.</li> |
| </ol> |
| <p>So what do we see?</p> |
| <p>(Graph reading guide: nodes are models; edges are modular imports).</p> |
| <p>Check out the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer here</a> (tab “dependency graph”, hit “build graph”) for better manipulation and exploration.</p> |
| <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-g502a7qa91"><iframe |
| src="https://molbap-dependencies-1.hf.space" |
| title="Interactive dependency graph of modular transformers models" |
| style="width:100%; height:680px; border:0" |
| allow="clipboard-read; clipboard-write; fullscreen" |
| referrerpolicy="no-referrer-when-downgrade" |
| ></iframe></div></div></figure> </div> |
| <p>Let’s walk through some sections of this graph together. |
| First, Llama is a basis and an influence for many models, and it is very visible.</p> |
| <div class="ri-root" data-ri-root="ri_nxt8ga618yr" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/llama_center.CbQ5MyAc_ZraNCd.webp" alt="Llama in the center" data-zoomable="1" data-astro-cid-6kov3kig width="1030" height="1015" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 2:</strong> Llama as a central model influencing many other models in the dependency graph.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>The linked models sometimes pull components from models other than <code>llama</code>, of course. Radically different architectures such as mamba have spawned their own dependency subgraph.</p> |
| <p>Audio models form sparser archipelagos; see for instance wav2vec2, which is a significant basis for a dozen of them.</p> |
| <div class="ri-root" data-ri-root="ri_9hr9bnytw9m" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/cluster_wave2vec2.BvHBUP61_Z2vdkmW.webp" alt="Wav2vec2 influence" data-zoomable="1" data-astro-cid-6kov3kig width="608" height="563" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 3:</strong> Cluster of audio architectures based on wav2vec2, forming a specialized archipelago.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>In the case of VLMs, which have massively grown in popularity since 2024, there are far too many vision-based architectures that are not yet defined as modulars of other existing archs. In other words, there is no strong reference point in terms of software for vision models.</p> |
| <p>As you can see, there is a small <code>DETR</code> island:</p> |
| <div class="ri-root" data-ri-root="ri_kz6ikem8rwq" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/detr_island.CSrqELWy_1IAFDR.webp" alt="DETR archipelago" data-zoomable="1" data-astro-cid-6kov3kig width="591" height="606" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 4:</strong> Small DETR archipelago for vision models, less centralized than Llama for text.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>There is also a little llava pocket, and so on, but it’s not comparable to the centrality observed for llama.</p> |
| <p>Another problem is that this visualization only shows <code>modular</code> models. Several models still do NOT have a modular file. If we zoom out significantly, we can see them: the red nodes are models that do not have a modular file yet.</p> |
| <div class="ri-root" data-ri-root="ri_aa7595kjhfv" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/big_picture_zoomout.BKwXtSkj_1bNS6U.webp" alt="Red nodes" data-zoomable="1" data-astro-cid-6kov3kig width="1043" height="972" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 5:</strong> Overview showing red nodes (models without modular files) to be modularized.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>Hence the next question: how do we identify modularisable models?</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Llama-lineage is a hub; several VLMs remain islands — engineering opportunity for shared parents. |
| <strong>Next:</strong> timeline + similarity signals to spot modularisable candidates.</p> </div> </div> </div> </div> |
| <h3 id="many-models-but-not-enough-yet-are-alike"><a href="#many-models-but-not-enough-yet-are-alike">Many models, but not enough yet, are alike</a></h3> |
| <p>I looked into Jaccard similarity, which measures how much two sets overlap, to find similarities across models. I know that code is more than a set of characters strung together. We also tried code-embedding models that ranked candidates better in practice, but for this post we stick to the deterministic Jaccard index.</p> |
| <p>It is interesting, for our comparison, to look at <em>when</em> we deployed the modular logic and what its ripple effect on the library was. Looking at the timeline makes it obvious: adding modular allowed us to connect more and more models to solid reference points.</p> |
| <p>Yet, we still have a lot of gaps to fill.</p> |
| <p>Zoom out below - it’s full of models. You can click on a node to see its connections better, or use the text box to search for a model. You can use the <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">full viewer</a> (tab “timeline”, hit “build timeline”) for better exploration.</p> |
| <div class="wide"> <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-8ei9z6zza54"> <iframe |
| src="https://molbap-timeline-1.hf.space" |
| title="Interactive timeline of transformers models and their modular connections" |
| style="width:100%; height:680px; border:0" |
| allow="clipboard-read; clipboard-write; fullscreen" |
| referrerpolicy="no-referrer-when-downgrade" |
| ></iframe></div></div></figure> </div> |
| <p>Let’s look at a few highly connected models. Let’s start with the foundational work of <a href="https://arxiv.org/abs/2304.08485">Llava</a>.</p> |
| <div class="ri-root" data-ri-root="ri_9y5lnalbey6" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/timeline_llava.Bne5RSo9_Z26WEKX.webp" alt="Llava in its timeline" data-zoomable="1" data-astro-cid-6kov3kig width="1250" height="770" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 6:</strong> LLaVA and its variants in the timeline, with llava_video as a candidate for modularization.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>You see that <code>llava_video</code> is a red node, connected by a red edge to <code>llava</code>: it’s a candidate, something that we can <em>likely</em> remodularize, <span class="glossary-term " style="" data-tooltip-id="tenet-9fefkl0jqgh" onmouseenter="window.showTenetTooltip(event, 'tenet-9fefkl0jqgh')" onmousemove="window.updateTenetTooltip(event, 'tenet-9fefkl0jqgh')" onmouseleave="window.hideTenetTooltip('tenet-9fefkl0jqgh')">not touching the actual model</span><span id="tenet-9fefkl0jqgh" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">backwards-compatibility</span> <span class="glossary-tooltip__definition">Any artifact once on the hub must remain loadable. Breaking changes are unacceptable.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> but being much more readable with <span class="glossary-term " style="" data-tooltip-id="tenet-0sb86mxym6sb" onmouseenter="window.showTenetTooltip(event, 'tenet-0sb86mxym6sb')" onmousemove="window.updateTenetTooltip(event, 'tenet-0sb86mxym6sb')" onmouseleave="window.hideTenetTooltip('tenet-0sb86mxym6sb')">DRY*</span><span id="tenet-0sb86mxym6sb" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">do-repeat-yourself</span> <span class="glossary-tooltip__definition">Strategic duplication can improve readability and maintainability when done thoughtfully.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <p>The same can be identified with the classical encoders family, centered on <code>BERT</code>:</p> |
| <p>Here <code>roberta</code>, <code>xlm_roberta</code>, <code>ernie</code> are <code>modular</code>s of BERT, while models like <code>mobilebert</code> are likely candidates.</p> |
| <div class="ri-root" data-ri-root="ri_odwq8ysmzlh" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/classic_encoders.BSgQl9lp_3OtlT.webp" alt="Classical encoders" data-zoomable="1" data-astro-cid-6kov3kig width="1274" height="749" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 7:</strong> Family of classical encoders centered on BERT, with several models already modularized.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Similarity metrics (Jaccard index or embeddings) surface likely parents; the timeline shows consolidation after modular landed. Red nodes/edges = candidates (e.g., <code>llava_video</code> → <code>llava</code>) for refactors that preserve behavior.</p><p><strong>Next:</strong> concrete VLM choices that avoid leaky abstractions.</p> </div> </div> </div> </div> |
| <h3 id="vlm-improvements-avoiding-abstraction"><a href="#vlm-improvements-avoiding-abstraction">VLM improvements, avoiding abstraction</a></h3> |
| <p>We don’t yet have a cookbook for common VLM patterns (image token scatter, multi‑tower encoders, cross‑attention bridges). This is one of the main improvement points where we can work.</p> |
| <p>For instance, we thought of abstracting away the mixing of <code>inputs_embeds</code>, the tensor fed into an LLM decoder in 95% of the existing VLMs. It would have looked something like this:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">class</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> InputsEmbeddingMixerMixin</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">nn</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">Module</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">):</span></span> |
| <span class="line"><span style="--shiki-light:#6A737D;--shiki-dark:#6A737D"> #</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>But this is <span class="glossary-term " style="" data-tooltip-id="tenet-rz8osrul6qc" onmouseenter="window.showTenetTooltip(event, 'tenet-rz8osrul6qc')" onmousemove="window.updateTenetTooltip(event, 'tenet-rz8osrul6qc')" onmouseleave="window.hideTenetTooltip('tenet-rz8osrul6qc')">not an abstraction</span><span id="tenet-rz8osrul6qc" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">standardize-dont-abstract</span> <span class="glossary-tooltip__definition">Model-specific logic belongs in the model file, not hidden behind abstractions.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. Embedding mixing is part of the model; removing it would break it. A user opening <a href="https://github.com/huggingface/transformers/blob/b3bd815786c36f4e6c3791fae0a96cac86658b32/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1358"><code>modeling_qwen2_5_vl</code></a> (check out the <a href="https://huggingface.co/collections/Qwen/qwen25-vl-6795ffac22b334a837c0f9a5">Qwen2.5VL collection</a>) should not have to go to another file to understand how it works.</p> |
| <p>What is the current state of these “abstractions” across the codebase? |
| You will see all the imports around a modeling file, here <a href="https://huggingface.co/google/gemma-3n-E4B-it">Gemma3n</a>.</p> |
| <div class="ri-root" data-ri-root="ri_h0sxkevsd9k" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/still_graph_bloat.BII6Am4a_Z2d5KVT.webp" alt="Gemma3n graph" data-zoomable="1" data-astro-cid-6kov3kig width="2580" height="1207" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 8:</strong> Gemma3n import graph showing dependency complexity, with GenerationMixin very central.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <p>As you can see, the <code>GenerationMixin</code> node is already very heavy. It encompasses all of the utilities around <code>.generate</code> and is second only to <code>nn.Module</code>. |
| That means every decision we make to abstract something else has to be made with extreme care.</p> |
| <p>The following <a href="https://github.com/huggingface/transformers/pull/39777">Pull request to standardize placeholder masking</a> is a good example of what kind of changes are acceptable. In a VLM, we always need to insert embeddings from various encoders at various positions, so we can have a function to do it. For Qwen2 VL, for instance, it will look like this:</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="python"><code><span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> def</span><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0"> get_placeholder_mask</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> self,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_ids: torch.LongTensor,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> inputs_embeds: torch.FloatTensor,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> image_features: torch.FloatTensor </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> video_features: torch.FloatTensor </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">,</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> ):</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> """</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> equal to the length of multimodal features. If the lengths are different, an error is raised.</span></span> |
| <span class="line"><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> """</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_ids </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">is</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> inputs_embeds </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">==</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.get_input_embeddings()(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.tensor(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config.image_token_id, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dtype</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">torch.long, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">device</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">inputs_embeds.device)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask.all(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> inputs_embeds </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">==</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.get_input_embeddings()(</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> torch.tensor(</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config.video_token_id, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">dtype</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">torch.long, </span><span style="--shiki-light:#E36209;--shiki-dark:#FFAB70">device</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">inputs_embeds.device)</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask.all(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> else</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">:</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_ids </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">==</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config.image_token_id</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> input_ids </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">==</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> self</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">.config.video_token_id</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> n_image_tokens </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask.sum()</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask.unsqueeze(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">).expand_as(inputs_embeds).to(inputs_embeds.device)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> image_features </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">is</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> not</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> and</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> inputs_embeds[special_image_mask].numel() </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> image_features.numel():</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> raise</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ValueError</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> f</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"Image features and image tokens do not match: tokens: </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">{</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">n_image_tokens</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">}</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">, features </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">{</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">image_features.shape[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">0</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">}</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> n_video_tokens </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask.sum()</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_video_mask.unsqueeze(</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">-</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">1</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">).expand_as(inputs_embeds).to(inputs_embeds.device)</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> if</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> video_features </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">is</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> not</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> None</span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> and</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> inputs_embeds[special_video_mask].numel() </span><span style="--shiki-light:#D73A49;--shiki-dark:#F97583">!=</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> video_features.numel():</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> raise</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> ValueError</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">(</span></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> f</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"Videos features and video tokens do not match: tokens: </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">{</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">n_video_tokens</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">}</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">, features </span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">{</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">video_features.shape[</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">0</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">]</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF">}</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> )</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#D73A49;--shiki-dark:#F97583"> return</span><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8"> special_image_mask, special_video_mask</span></span> |
| <span class="line"></span></code></pre></div> |
| <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It will not move away from it, because it’d break the <span class="glossary-term " style="" data-tooltip-id="tenet-m8wjbvzuoa" onmouseenter="window.showTenetTooltip(event, 'tenet-m8wjbvzuoa')" onmousemove="window.updateTenetTooltip(event, 'tenet-m8wjbvzuoa')" onmouseleave="window.hideTenetTooltip('tenet-m8wjbvzuoa')">One model, one file tenet.</span><span id="tenet-m8wjbvzuoa" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">one-model-one-file</span> <span class="glossary-tooltip__definition">All inference and training core logic visible, top‑to‑bottom, in a single file.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script></p> |
| <p>What do we conclude? Going forward, we should aim for VLMs to have a form of centrality similar to that of <code>Llama</code> for text-only models. This centrality should not be achieved at the cost of abstracting and hiding away crucial inner workings of said models.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Keep VLM embedding mix in the modeling file (semantics), standardize safe helpers (e.g., placeholder masking), don’t migrate behavior to <code>PreTrainedModel</code>. |
| <strong>Next:</strong> pipeline-level wins that came from PyTorch-first choices (fast processors).</p> </div> </div> </div> </div> |
| <h3 id="on-image-processing-and-processors"><a href="#on-image-processing-and-processors">On image processing and processors</a></h3> |
| <p>Deciding to become a <code>torch</code>-first library meant relieving ourselves of a tremendous amount of support work for <code>jax</code> and <code>TensorFlow</code>, and it also meant that we could be more lenient about the number of torch-dependent utilities that we were able to accept. One of these is the <em>fast processing</em> of images. Where inputs were once minimally assumed to be ndarrays, enforcing native <code>torch</code> and <code>torchvision</code> inputs allowed us to massively improve processing speed for each model.</p> |
| <p>The gains in performance are immense, up to 20x speedup for most models when using compiled torchvision ops. Furthermore, it lets us run the whole pipeline solely on GPU.</p> |
| <div class="ri-root" data-ri-root="ri_1mtaydnyoyc" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/fast_image_processors.D3x5vY3o_2cacGa.webp" alt="Fast Image Processors Performance" data-zoomable="1" data-astro-cid-6kov3kig width="2251" height="2409" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 9:</strong> Performance gains of fast image processors, up to 20x acceleration with compiled torchvision.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>PyTorch-first lets processors assume torch/torchvision and run the whole pipeline on GPU; big per-model speedups.</p><p><strong>Next:</strong> how this lowers friction for contributors and downstream users.</p> </div> </div> </div> </div> |
| <h2 id="reduce-barrier-to-entrycontribution"><a href="#reduce-barrier-to-entrycontribution">Reduce barrier to entry/contribution</a></h2> |
| <p>This is an overall objective: there’s no <code>transformers</code> without its community.</p> |
| <p>Having a framework means forcing users into it. It restrains flexibility and creativity, which are the fertile soil for new ideas to grow.</p> |
| <p>Among the most valuable contributions to <code>transformers</code> is of course the addition of new models. Very recently, <a href="https://huggingface.co/blog/welcome-openai-gpt-oss">OpenAI added GPT-OSS</a>, which prompted the addition of many new features to the library in order to support <a href="https://huggingface.co/openai/gpt-oss-120b">their model</a>.</p> |
| <p>These additions are immediately available for other models to use.</p> |
| <p>Another important advantage is the ability to fine-tune and pipeline these models into many other libraries and tools. Check here on the hub how many finetunes are registered for <a href="https://huggingface.co/models?other=base_model:finetune:openai/gpt-oss-120b">gpt-oss 120b</a>, despite its size!</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>The shape of a contribution: add a model (or variant) with a small modular shard; the community and serving stacks pick it up immediately. Popularity trends (encoders/embeddings) guide where we invest.</p><p><strong>Next:</strong> power tools enabled by a consistent API.</p> </div> </div> </div> </div> |
| <h3 id="-models-popularity"><span id="encoders-ftw"></span><a href="#-models-popularity">Models popularity</a></h3> |
| <p>Talking about dependencies, we can take a look at the number of downloads as a measure of popularity. One thing we see is the prominence of encoders, despite the apparent prevalence of decoder LLMs. The reason is that encoders are used to generate embeddings, which have multiple downstream uses. Just check out <a href="https://huggingface.co/blog/embeddinggemma">EmbeddingGemma</a> for a modern recap. Hence, it is vital to keep the encoders portion of the library viable, usable, fine-tunable.</p> |
| <div><figure class="html-embed"><div class="html-embed__card"><div id="frag-n1hiiye0bdk"><div class="d3-model-popularity"></div> |
| <style> |
| /* Metric selector row */ |
| .d3-model-popularity .controls { |
|   margin-top: 0; |
|   display: flex; |
|   gap: 16px; |
|   align-items: center; |
|   justify-content: center; |
|   flex-wrap: wrap; |
|   min-width: 200px; |
| } |
| |
| .d3-model-popularity .controls .control-group { |
|   display: flex; |
|   flex-direction: column; |
|   align-items: flex-start; |
|   gap: 6px; |
|   min-width: 150px; |
| } |
| |
| .d3-model-popularity .controls label { |
|   font-size: 12px; |
|   color: var(--text-color); |
|   font-weight: 700; |
| } |
| |
| /* Native select restyled with an inline SVG chevron */ |
| .d3-model-popularity .controls select { |
|   font-size: 13px; |
|   min-width: 160px; |
|   padding: 8px 28px 8px 10px; |
|   border: 1px solid var(--border-color); |
|   border-radius: 8px; |
|   background-color: var(--surface-bg); |
|   color: var(--text-color); |
|   background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%230f1115' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
|   background-repeat: no-repeat; |
|   background-position: right 8px center; |
|   background-size: 12px; |
|   -webkit-appearance: none; |
|   -moz-appearance: none; |
|   appearance: none; |
|   cursor: pointer; |
|   transition: border-color .15s ease, box-shadow .15s ease; |
| } |
| |
| /* Same chevron with a white stroke for the dark theme */ |
| [data-theme="dark"] .d3-model-popularity .controls select { |
|   background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='12' height='12' viewBox='0 0 24 24' fill='none' stroke='%23ffffff' stroke-width='2' stroke-linecap='round' stroke-linejoin='round'%3E%3Cpolyline points='6 9 12 15 18 9'/%3E%3C/svg%3E"); |
| } |
| |
| .d3-model-popularity .controls select:hover { |
|   border-color: var(--primary-color); |
| } |
| |
| /* The box-shadow ring stands in for the removed outline */ |
| .d3-model-popularity .controls select:focus { |
|   border-color: var(--primary-color); |
|   box-shadow: 0 0 0 3px rgba(232, 137, 171, .25); |
|   outline: none; |
| } |
| |
| /* Header holding the controls */ |
| .d3-model-popularity .chart-header { |
|   display: flex; |
|   align-items: center; |
|   justify-content: center; |
|   gap: 12px; |
|   margin: 8px 0 20px 0; |
|   padding: 0; |
|   flex-wrap: wrap; |
| } |
| |
| /* Legend */ |
| .d3-model-popularity .legend-bottom { |
|   display: flex; |
|   flex-direction: column; |
|   align-items: flex-start; |
|   gap: 6px; |
|   font-size: 12px; |
|   color: var(--text-color); |
| } |
| |
| .d3-model-popularity .legend-bottom .legend-title { |
|   font-size: 12px; |
|   font-weight: 700; |
|   color: var(--text-color); |
| } |
| |
| .d3-model-popularity .legend-bottom .items { |
|   display: flex; |
|   flex-wrap: wrap; |
|   gap: 8px 14px; |
| } |
| |
| .d3-model-popularity .legend-bottom .item { |
|   display: inline-flex; |
|   align-items: center; |
|   gap: 6px; |
|   white-space: nowrap; |
| } |
| |
| .d3-model-popularity .legend-bottom .swatch { |
|   width: 14px; |
|   height: 14px; |
|   border-radius: 3px; |
|   border: 1px solid var(--border-color); |
|   display: inline-block; |
| } |
| |
| /* Axes, grid and labels */ |
| .d3-model-popularity .axis-label { |
|   fill: var(--text-color); |
|   font-size: 12px; |
|   font-weight: 700; |
| } |
| |
| .d3-model-popularity .axes path, |
| .d3-model-popularity .axes line { |
|   stroke: var(--axis-color); |
| } |
| |
| .d3-model-popularity .axes text { |
|   fill: var(--tick-color); |
| } |
| |
| .d3-model-popularity .grid line { |
|   stroke: var(--grid-color); |
| } |
| |
| .d3-model-popularity .model-name { |
|   font-size: 11px; |
|   fill: var(--tick-color); |
|   text-anchor: end; |
| } |
| |
| /* Tooltip */ |
| .d3-model-popularity .d3-tooltip { |
|   z-index: var(--z-tooltip); |
|   backdrop-filter: saturate(1.12) blur(8px); |
| } |
| |
| /* Bars */ |
| .d3-model-popularity .bars rect { |
|   transition: opacity .15s ease, stroke .15s ease; |
| } |
| |
| .d3-model-popularity .bars rect:hover { |
|   opacity: 0.8; |
|   stroke: var(--text-color); |
|   stroke-width: 1px; |
| } |
| |
| /* Card around the SVG */ |
| .d3-model-popularity .chart-card { |
|   background: var(--surface-bg); |
|   border: 1px solid var(--border-color); |
|   border-radius: 10px; |
|   padding: 8px; |
| } |
| </style> |
| <script> |
| (() => { |
| const ensureD3 = (cb) => { |
| if (window.d3 && typeof window.d3.select === 'function') return cb(); |
| let s = document.getElementById('d3-cdn-script'); |
| if (!s) { s = document.createElement('script'); s.id = 'd3-cdn-script'; s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js'; document.head.appendChild(s); } |
| const onReady = () => { if (window.d3 && typeof window.d3.select === 'function') cb(); }; |
| s.addEventListener('load', onReady, { once: true }); |
| if (window.d3) onReady(); |
| }; |
| |
| const bootstrap = () => { |
| const scriptEl = document.currentScript; |
| let container = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!(container && container.classList && container.classList.contains('d3-model-popularity'))) { |
| const candidates = Array.from(document.querySelectorAll('.d3-model-popularity')) |
| .filter((el) => !(el.dataset && el.dataset.mounted === 'true')); |
| container = candidates[candidates.length - 1] || null; |
| } |
| if (!container) return; |
| if (container.dataset) { |
| if (container.dataset.mounted === 'true') return; |
| container.dataset.mounted = 'true'; |
| } |
| |
| |
| let mountEl = container; |
| while (mountEl && !mountEl.getAttribute?.('data-datafiles') && !mountEl.getAttribute?.('data-config')) { |
| mountEl = mountEl.parentElement; |
| } |
| let providedData = null; |
| try { |
| const attr = mountEl && mountEl.getAttribute ? mountEl.getAttribute('data-datafiles') : null; |
| if (attr && attr.trim()) { |
| providedData = attr.trim().startsWith('[') ? JSON.parse(attr) : attr.trim(); |
| } |
| } catch(_) {} |
| |
| const ensureDataPrefix = (p) => (typeof p === 'string' && p && !p.includes('/')) ? `/data/${p}` : p; |
| const normalizeInput = (inp) => Array.isArray(inp) |
| ? inp.map(ensureDataPrefix) |
| : (typeof inp === 'string' ? [ ensureDataPrefix(inp) ] : null); |
| |
| const CSV_PATHS = Array.isArray(providedData) |
| ? normalizeInput(providedData) |
| : (typeof providedData === 'string' ? normalizeInput(providedData) || ['/data/model_popularity_by_downloads.csv'] : [ |
| '/data/model_popularity_by_downloads.csv', |
| '/data/model_popularity_by_last_modified.csv', |
| './assets/data/model_popularity_by_downloads.csv', |
| './assets/data/model_popularity_by_last_modified.csv', |
| '../assets/data/model_popularity_by_downloads.csv', |
| '../assets/data/model_popularity_by_last_modified.csv', |
| '../../assets/data/model_popularity_by_downloads.csv', |
| '../../assets/data/model_popularity_by_last_modified.csv' |
| ]); |
| |
| const fetchFirstAvailable = async (paths) => { |
| for (const p of paths) { |
| try { |
| const r = await fetch(p, { cache: 'no-cache' }); |
| if (r.ok) return await r.text(); |
| } catch(_){} |
| } |
| throw new Error('CSV not found'); |
| }; |
| |
| |
| const controls = document.createElement('div'); |
| controls.className = 'controls'; |
| const groupMetric = document.createElement('div'); |
| groupMetric.className = 'control-group'; |
| const labelMetric = document.createElement('label'); |
| labelMetric.textContent = 'Metric'; |
| const selMetric = document.createElement('select'); |
| |
| |
| const metrics = [ |
| { value: 'downloads', text: 'Sort by Downloads' }, |
| { value: 'last_modified', text: 'Sort by Last Modified' } |
| ]; |
| metrics.forEach((m) => { |
| const o = document.createElement('option'); |
| o.value = m.value; |
| o.textContent = m.text; |
| selMetric.appendChild(o); |
| }); |
| |
| groupMetric.appendChild(labelMetric); |
| groupMetric.appendChild(selMetric); |
| |
| |
| const header = document.createElement('div'); |
| header.className = 'chart-header'; |
| header.appendChild(controls); |
| |
| |
| container.appendChild(header); |
| |
| |
| const card = document.createElement('div'); |
| card.className = 'chart-card'; |
| container.appendChild(card); |
| const svg = d3.select(card).append('svg').attr('width', '100%').style('display', 'block'); |
| const gRoot = svg.append('g'); |
| const gGrid = gRoot.append('g').attr('class', 'grid'); |
| const gAxes = gRoot.append('g').attr('class', 'axes'); |
| const gBars = gRoot.append('g').attr('class', 'bars'); |
| |
| |
| container.style.position = container.style.position || 'relative'; |
| let tip = container.querySelector('.d3-tooltip'); |
| let tipInner; |
| if (!tip) { |
| tip = document.createElement('div'); |
| tip.className = 'd3-tooltip'; |
| Object.assign(tip.style, { |
| position: 'absolute', |
| top: '0px', |
| left: '0px', |
| transform: 'translate(-9999px, -9999px)', |
| pointerEvents: 'none', |
| padding: '8px 10px', |
| borderRadius: '8px', |
| fontSize: '12px', |
| lineHeight: '1.35', |
| border: '1px solid var(--border-color)', |
| background: 'var(--surface-bg)', |
| color: 'var(--text-color)', |
| boxShadow: '0 4px 24px rgba(0,0,0,.18)', |
| opacity: '0', |
| transition: 'opacity .12s ease' |
| }); |
| tipInner = document.createElement('div'); |
| tipInner.className = 'd3-tooltip__inner'; |
| tipInner.style.textAlign = 'left'; |
| tip.appendChild(tipInner); |
| container.appendChild(tip); |
| } else { |
| tipInner = tip.querySelector('.d3-tooltip__inner') || tip; |
| } |
| |
| |
| let currentMetric = 'downloads'; |
| let data = []; |
| selMetric.value = currentMetric; |
| |
| |
| let width = 800, height = 1000; |
| const margin = { top: 16, right: 30, bottom: 80, left: 290 }; |
| const y = d3.scaleBand().padding(0.2); |
| const x = d3.scaleLinear(); |
| |
| function getCategoricalColors(count) { |
| try { |
| if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') { |
| return window.ColorPalettes.getColors('categorical', count); |
| } |
| } catch (_) { } |
| |
| return ['#636efa', '#EF553B', '#00cc96', '#ab63fa', '#FFA15A', '#19d3f3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']; |
| } |
| |
| function formatNumber(num) { |
| if (num >= 1000000) { |
| return (num / 1000000).toFixed(1) + 'M'; |
| } else if (num >= 1000) { |
| return (num / 1000).toFixed(1) + 'K'; |
| } |
| return num.toString(); |
| } |
| |
| function getModelType(name) { |
| const lower = name.toLowerCase(); |
| if (lower.includes('clip')) return 'clip'; |
| if (lower.includes('vit')) return 'vit'; |
| if (lower.includes('resnet')) return 'resnet'; |
| if (lower.includes('whisper')) return 'whisper'; |
| if (lower.includes('blip')) return 'blip'; |
| if (lower.includes('qwen')) return 'qwen2_vl'; |
| if (lower.includes('gemma')) return 'gemma3'; |
| if (lower.includes('dinov2')) return 'dinov2'; |
| if (lower.includes('siglip')) return 'siglip'; |
| if (lower.includes('table')) return 'table_transformer'; |
| if (lower.includes('layout')) return 'layoutlmv3'; |
| if (lower.includes('music')) return 'musicgen'; |
| if (lower.includes('internvl')) return 'internvl'; |
| if (lower.includes('clipseg')) return 'clipseg'; |
| if (lower.includes('vitmatte')) return 'vitmatte'; |
| if (lower.includes('mobilevit')) return 'mobilevit'; |
| if (lower.includes('unidepth')) return 'vit'; |
| if (lower.includes('align')) return 'align'; |
| if (lower.includes('bit')) return 'bit'; |
| if (lower.includes('bert')) return 'd_fine'; |
| if (lower.includes('fashion')) return 'clip'; |
| if (lower.includes('age')) return 'vit'; |
| if (lower.includes('nsfw')) return 'vit'; |
| return 'other'; |
| } |
| |
| async function loadData() { |
| try { |
| const csvText = await fetchFirstAvailable(CSV_PATHS); |
| const parsed = d3.csvParse(csvText); |
| |
| |
| data = parsed.map(d => ({ |
| name: d['Model Name'].replace(/"/g, ''), |
| downloads: parseInt(d.Downloads.replace(/"/g, '')) || 0 |
| })).filter(d => d.downloads > 0); |
| |
| |
| const uniqueData = new Map(); |
| data.forEach(d => { |
| if (!uniqueData.has(d.name) || uniqueData.get(d.name).downloads < d.downloads) { |
| uniqueData.set(d.name, d); |
| } |
| }); |
| |
| data = Array.from(uniqueData.values()); |
| |
| |
| if (currentMetric === 'downloads') { |
| data.sort((a, b) => b.downloads - a.downloads); |
| } else { |
| |
| |
| |
| } |
| |
| |
| data.forEach(d => { |
| d.type = getModelType(d.name); |
| }); |
| |
| update(); |
| } catch (error) { |
| console.error('Error loading data:', error); |
| const errorDiv = document.createElement('pre'); |
| errorDiv.style.color = 'red'; |
| errorDiv.style.padding = '16px'; |
| errorDiv.textContent = `Error loading data: ${error.message}`; |
| container.appendChild(errorDiv); |
| } |
| } |
| |
| function updateScales() { |
| width = container.clientWidth || 800; |
| |
| const minHeightPerModel = 25; |
| const calculatedHeight = Math.max(400, data.length * minHeightPerModel + margin.top + margin.bottom); |
| height = Math.min(1200, calculatedHeight); |
| svg.attr('width', width).attr('height', height); |
| const innerWidth = width - margin.left - margin.right; |
| const innerHeight = height - margin.top - margin.bottom; |
| gRoot.attr('transform', `translate(${margin.left},${margin.top})`); |
| |
| |
| y.domain(data.map(d => d.name)).range([0, innerHeight]); |
| x.domain([0, d3.max(data, d => d.downloads) || 1]).range([0, innerWidth]).nice(); |
| |
| |
| gGrid.selectAll('*').remove(); |
| gGrid.selectAll('line').data(x.ticks(6)).join('line') |
| .attr('x1', (d) => x(d)).attr('x2', (d) => x(d)).attr('y1', 0).attr('y2', innerHeight) |
| .attr('stroke', 'var(--grid-color)').attr('stroke-width', 1).attr('shape-rendering', 'crispEdges'); |
| |
| |
| gAxes.selectAll('*').remove(); |
| |
| |
| gAxes.append('g').attr('transform', `translate(0,${innerHeight})`) |
| .call(d3.axisBottom(x).ticks(6).tickFormat(d3.format('~s'))) |
| .call((g) => { |
| g.selectAll('path, line').attr('stroke', 'var(--axis-color)'); |
| g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '12px'); |
| }); |
| |
| |
| gAxes.append('g') |
| .call(d3.axisLeft(y)) |
| .call((g) => { |
| g.selectAll('path, line').attr('stroke', 'var(--axis-color)'); |
| g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size', '11px'); |
| }); |
| |
| |
| gAxes.append('text').attr('class', 'axis-label axis-label--x') |
| .attr('x', innerWidth / 2).attr('y', innerHeight + 64) |
| .attr('text-anchor', 'middle').text('Downloads'); |
| |
| |
| return { innerWidth, innerHeight }; |
| } |
| |
| function renderLegend() { |
| |
| } |
| |
| function drawBars() { |
| if (!data || data.length === 0) return; |
| |
| const { innerWidth, innerHeight } = updateScales(); |
| |
| |
| const types = [...new Set(data.map(d => d.type))]; |
| const colors = getCategoricalColors(types.length); |
| |
| |
| const shuffledColors = [...colors].sort(() => Math.random() - 0.5); |
| const colorMap = new Map(types.map((type, i) => [type, shuffledColors[i]])); |
| |
| const bars = gBars.selectAll('rect').data(data, d => d.name); |
| |
| bars.enter().append('rect') |
| .attr('y', d => y(d.name)) |
| .attr('x', 1) |
| .attr('width', 0) |
| .attr('height', y.bandwidth()) |
| .attr('fill', d => colorMap.get(d.type)) |
| .on('mouseenter', function (ev, d) { |
| tipInner.innerHTML = ` |
| <div><strong>${d.name}</strong></div> |
| <div><strong>Type:</strong> ${d.type}</div> |
| <div><strong>Downloads:</strong> ${d.downloads.toLocaleString()}</div> |
| `; |
| tip.style.opacity = '1'; |
| }) |
| .on('mousemove', function (ev, d) { |
| const [mx, my] = d3.pointer(ev, container); |
| const offsetX = 12, offsetY = 12; |
| const maxX = (container.clientWidth || 0) - (tip.offsetWidth + 6); |
| const maxY = (container.clientHeight || 0) - (tip.offsetHeight + 6); |
| const tx = Math.max(0, Math.min(mx + offsetX, maxX)); |
| const ty = Math.max(0, Math.min(my + offsetY, maxY)); |
| tip.style.transform = `translate(${Math.round(tx)}px, ${Math.round(ty)}px)`; |
| }) |
| .on('mouseleave', function () { |
| tip.style.opacity = '0'; |
| tip.style.transform = 'translate(-9999px, -9999px)'; |
| }) |
| .merge(bars) |
| .transition().duration(600) |
| .attr('y', d => y(d.name)) |
| .attr('x', 1) |
| .attr('width', d => x(d.downloads)) |
| .attr('height', y.bandwidth()) |
| .attr('fill', d => colorMap.get(d.type)); |
| |
| bars.exit().remove(); |
| |
| renderLegend(); |
| } |
| |
| function update() { |
| drawBars(); |
| } |
| |
| |
| selMetric.addEventListener('change', async (e) => { |
| currentMetric = e.target.value; |
| |
| const newPaths = currentMetric === 'downloads' |
| ? ['/data/model_popularity_by_downloads.csv', './assets/data/model_popularity_by_downloads.csv', '../assets/data/model_popularity_by_downloads.csv'] |
| : ['/data/model_popularity_by_last_modified.csv', './assets/data/model_popularity_by_last_modified.csv', '../assets/data/model_popularity_by_last_modified.csv']; |
| |
| CSV_PATHS.splice(0, CSV_PATHS.length, ...newPaths); |
| await loadData(); |
| }); |
| |
| |
| controls.appendChild(groupMetric); |
| loadData(); |
| |
| const rerender = () => { update(); }; |
| if (window.ResizeObserver) { |
| const ro = new ResizeObserver(() => rerender()); |
| ro.observe(container); |
| } else { |
| window.addEventListener('resize', rerender); |
| } |
| }; |
| |
| if (document.readyState === 'loading') { |
| document.addEventListener('DOMContentLoaded', () => ensureD3(bootstrap), { once: true }); |
| } else { |
| ensureD3(bootstrap); |
| } |
| })(); |
| </script> |
| </div></div></figure></div> |
| <p>As the codebase grows, we need to maintain it in coordination with our friend, the <a href="https://huggingface.co/sentence-transformers">Sentence Transformers codebase</a>. Retrieval use-cases, smart databases, FAISS-based indexing rely on it, and thus indirectly on transformers.</p> |
| <p>In that regard, we DO want to be a modular toolbox, being <span class="glossary-term " style="" data-tooltip-id="tenet-h6nw2z388vr" onmouseenter="window.showTenetTooltip(event, 'tenet-h6nw2z388vr')" onmousemove="window.updateTenetTooltip(event, 'tenet-h6nw2z388vr')" onmouseleave="window.hideTenetTooltip('tenet-h6nw2z388vr')">minimal</span><span id="tenet-h6nw2z388vr" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">minimal-user-api</span> <span class="glossary-tooltip__definition">Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. Least amount of codepaths.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script> enough and well documented enough so any ML/AI developer can use <code>transformers</code> without having to think about it. We aim to reduce the cognitive load brought about by model development, not increase it.</p> |
| <p>So, how do these design choices, these “tenets”, influence development of models and overall usage of transformers?</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Encoders remain critical for embeddings and retrieval; maintaining them well benefits the broader ecosystem (e.g., Sentence Transformers, FAISS).</p><p><strong>Next:</strong> dev tools that leverage unified attention APIs and PyTorch-only internals.</p> </div> </div> </div> </div> |
| <h2 id="a-surgical-toolbox-for-model-development"><a href="#a-surgical-toolbox-for-model-development">A surgical toolbox for model development</a></h2> |
| <p>Transformers provides many tools that can help you add a new architecture, understand the inner workings of a model, as well as the library itself.</p> |
| <h3 id="attention-visualisation"><a href="#attention-visualisation">Attention visualisation</a></h3> |
| <p>All models have the same API for attention computation, thanks to <a href="#external-attention-classes">the externalisation of attention classes</a>.</p> |
| <p>This uniformity allows us to build cool tools to visualize the inner workings of the attention mechanism.</p> |
| <p>One particular piece of machinery is the <code>attention mask</code>. Here you see the famous bidirectional attention pattern for the whole prefix (text + image) in PaliGemma and all Gemma2+ models, contrasting with the usual “causal-only” models.</p> |
| <figure class="html-embed"><div class="html-embed__card is-frameless"><div id="frag-j0nqgk6qmbm"> |
| <div style="max-width: 940px; margin: 16px 0; border:1px solid #2a2f3a; border-radius:8px; background:#0b0f19; font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, 'Liberation Mono', 'Courier New', monospace; color:#e5e7eb;"> |
| <div style="display:flex; align-items:center; gap:8px; padding:8px 10px; border-bottom:1px solid #1f2430; background:#111827; border-top-left-radius:8px; border-top-right-radius:8px;"> |
| <span style="width:10px; height:10px; background:#ef4444; border-radius:50%; display:inline-block;"></span> |
| <span style="width:10px; height:10px; background:#f59e0b; border-radius:50%; display:inline-block;"></span> |
| <span style="width:10px; height:10px; background:#22c55e; border-radius:50%; display:inline-block;"></span> |
| <span style="margin-left:8px; font-size:12px; color:#9ca3af;">attention-mask-visualizer</span> |
| </div> |
| <div style="padding:12px 14px; overflow:auto; font-size:12.5px; line-height:1.4;"> |
| <pre style="margin:0; white-space:pre; tab-size:2;"> |
| ATTN MASK — GPT-2 (causal) |
| Tokens: [The, cat, sat, on, the, mat] |
| Legend: x = can attend, . = masked (future) |
| |
| The cat sat on the mat |
| The x |
| cat x x |
| sat x x x |
| on x x x x |
| the x x x x x |
| mat x x x x x x |
| |
| |
| ATTN MASK — PaliGemma-style (bidirectional prefix + causal suffix) |
| Prefix: [&lt;i0&gt; &lt;i1&gt; &lt;i2&gt; &lt;i3&gt; &lt;i4&gt; What is this] |
| Suffix: [A great duck] |
| Legend: ✓ = can attend, ✗ = cannot |
|
|
| &lt;i0&gt;&lt;i1&gt;&lt;i2&gt;&lt;i3&gt;&lt;i4&gt; What is this | A great duck |
| &lt;i0&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| &lt;i1&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| &lt;i2&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| &lt;i3&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| &lt;i4&gt; ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| What ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| is ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| this ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ ✗ |
| -------------------------------------------------------------------- |
| A ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ ✗ |
| great ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✗ |
| duck ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ ✓ |
| </pre> |
| </div> |
| </div> |
| </div></div></figure> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Uniform attention APIs enable cross-model diagnostics (e.g., PaliGemma prefix bidirectionality vs causal).</p><p><strong>Next:</strong> whole-model tracing for ports and regressions.</p> </div> </div> </div> </div> |
| <h3 id="logging-entire-model-activations"><a href="#logging-entire-model-activations">Logging entire model activations</a></h3> |
| <p>Because everything is PyTorch, we can easily <a href="https://huggingface.co/docs/transformers/internal/model_debugging_utils">debug any model</a> when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p> |
| <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, to match our <span class="glossary-term " style="" data-tooltip-id="tenet-ws0h9w0zmra" onmouseenter="window.showTenetTooltip(event, 'tenet-ws0h9w0zmra')" onmousemove="window.updateTenetTooltip(event, 'tenet-ws0h9w0zmra')" onmouseleave="window.hideTenetTooltip('tenet-ws0h9w0zmra')">Source of Truth guideline</span><span id="tenet-ws0h9w0zmra" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">source-of-truth</span> <span class="glossary-tooltip__definition">Model implementations should be reliable, reproducible, and faithful to original performances.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>.</p> |
| <div class="wide"> <div class="ri-root" data-ri-root="ri_lag5ba2v8ic" data-has-caption data-astro-cid-6kov3kig> <figure class="" data-astro-cid-6kov3kig> <img src="/_astro/model_debugger.DouWEpKv_Z2338YE.webp" alt="Model debugger interface" data-zoomable="1" data-astro-cid-6kov3kig width="2053" height="1016" loading="lazy" decoding="async"> <figcaption data-astro-cid-6kov3kig> <span data-astro-cid-6kov3kig><strong>Figure 10:</strong> Model debugger interface intercepting calls and logging statistics in nested JSON.</span> </figcaption> </figure> </div> <script> |
| (() => { |
| const scriptEl = document.currentScript; |
| const root = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!root) { |
| console.log("Figure script: No root element found, exiting"); |
| return; |
| } |
| const img = |
| root.tagName === "IMG" |
| ? root |
| : root.querySelector |
| ? root.querySelector("img") |
| : null; |
| if (!img) { |
| console.log("Figure script: No img element found, exiting"); |
| return; |
| } |
| |
| |
| const ensureMediumZoomReady = (cb) => { |
| |
| if (window.mediumZoom) return cb(); |
| const retry = () => { |
| |
| if (window.mediumZoom) cb(); |
| else setTimeout(retry, 30); |
| }; |
| retry(); |
| }; |
| |
| const initZoomIfNeeded = () => { |
| if (img.getAttribute("data-zoomable") !== "1") return; |
| const isDark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| const background = isDark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)"; |
| ensureMediumZoomReady(() => { |
| |
| const instance = window.mediumZoom |
| ? window.mediumZoom(img, { |
| background, |
| margin: 24, |
| scrollOffset: 0, |
| container: { |
| top: 0, |
| right: 0, |
| bottom: 0, |
| left: 0 |
| } |
| }) |
| : null; |
| if (!instance) return; |
| let onScrollLike; |
| const attachCloseOnScroll = () => { |
| if (onScrollLike) return; |
| onScrollLike = () => { |
| try { |
| instance.close && instance.close(); |
| } catch {} |
| }; |
| window.addEventListener("wheel", onScrollLike, { passive: true }); |
| window.addEventListener("touchmove", onScrollLike, { passive: true }); |
| window.addEventListener("scroll", onScrollLike, { passive: true }); |
| }; |
| const detachCloseOnScroll = () => { |
| if (!onScrollLike) return; |
| window.removeEventListener("wheel", onScrollLike); |
| window.removeEventListener("touchmove", onScrollLike); |
| window.removeEventListener("scroll", onScrollLike); |
| onScrollLike = null; |
| }; |
| try { |
| instance.on && instance.on("open", attachCloseOnScroll); |
| } catch {} |
| try { |
| instance.on && instance.on("close", detachCloseOnScroll); |
| } catch {} |
| const themeObserver = new MutationObserver(() => { |
| const dark = |
| document.documentElement.getAttribute("data-theme") === "dark"; |
| try { |
| instance.update && |
| instance.update({ |
| background: dark ? "rgba(0,0,0,.9)" : "rgba(0,0,0,.85)", |
| }); |
| } catch {} |
| }); |
| themeObserver.observe(document.documentElement, { |
| attributes: true, |
| attributeFilter: ["data-theme"], |
| }); |
| }); |
| }; |
| |
| |
| const setupGlobalZoomBehavior = () => { |
| img.addEventListener("click", () => { |
| if (img.getAttribute("data-zoomable") === "1") { |
| |
| document |
| .querySelectorAll(".ri-root.zoom-active") |
| .forEach((el) => el.classList.remove("zoom-active")); |
| |
| |
| root.classList.add("zoom-active"); |
| } |
| }); |
| }; |
| |
| |
| const dlBtn = root.querySelector ? root.querySelector(".img-dl-btn") : null; |
| if (dlBtn) { |
| dlBtn.addEventListener("click", async (ev) => { |
| try { |
| ev.preventDefault(); |
| ev.stopPropagation(); |
| const pickHrefAndName = () => { |
| const current = img.currentSrc || img.src || ""; |
| let href = img.getAttribute("data-download-src") || current; |
| const deriveName = () => { |
| try { |
| const u = new URL(current, location.href); |
| const rawHref = u.searchParams.get("href"); |
| const candidate = rawHref |
| ? decodeURIComponent(rawHref) |
| : u.pathname; |
| const last = String(candidate).split("/").pop() || ""; |
| const base = last.split("?")[0].split("#")[0]; |
| const m = base.match( |
| /^(.+?\.(?:png|jpe?g|webp|avif|gif|svg))(?:[._-].*)?$/i, |
| ); |
| if (m && m[1]) return m[1]; |
| return base || "image"; |
| } catch { |
| return "image"; |
| } |
| }; |
| const name = img.getAttribute("data-download-name") || deriveName(); |
| return { href, name }; |
| }; |
| const picked = pickHrefAndName(); |
| const res = await fetch(picked.href, { credentials: "same-origin" }); |
| const blob = await res.blob(); |
| const objectUrl = URL.createObjectURL(blob); |
| const tmp = document.createElement("a"); |
| tmp.href = objectUrl; |
| tmp.download = picked.name || "image"; |
| tmp.target = "_self"; |
| tmp.rel = "noopener"; |
| tmp.style.display = "none"; |
| document.body.appendChild(tmp); |
| tmp.click(); |
| setTimeout(() => { |
| URL.revokeObjectURL(objectUrl); |
| tmp.remove(); |
| }, 1000); |
| } catch {} |
| }); |
| } |
| |
| |
| setupGlobalZoomBehavior(); |
| |
| if (document.readyState === "complete") initZoomIfNeeded(); |
| else window.addEventListener("load", initZoomIfNeeded, { once: true }); |
| })(); |
| </script> </div> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Forward interception and nested JSON logging align ports to reference implementations, reinforcing “Source of Truth.”</p><p><strong>Next:</strong> CUDA warmup reduces load-time without touching modeling semantics.</p> </div> </div> </div> </div> |
| <h3 id="cooking-faster-cuda-warmups"><a href="#cooking-faster-cuda-warmups">Cooking faster CUDA warmups</a></h3> |
| <p>Having a clean <em>external</em> API allows us to work on the <span class="glossary-term " style="" data-tooltip-id="tenet-8kzapkt6pz3" onmouseenter="window.showTenetTooltip(event, 'tenet-8kzapkt6pz3')" onmousemove="window.updateTenetTooltip(event, 'tenet-8kzapkt6pz3')" onmouseleave="window.hideTenetTooltip('tenet-8kzapkt6pz3')">true inner workings of transformers</span><span id="tenet-8kzapkt6pz3" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">code-is-product</span> <span class="glossary-tooltip__definition">Optimize for reading, diffing, and tweaking. Code quality matters as much as functionality.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>. One of a few recent additions was the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which dramatically improved loading times by pre-allocating GPU memory to avoid malloc bottlenecks during model loading. It can achieve a 7x speedup factor for an 8B model, or 6x for a 32B one, as you can check in <a href="https://github.com/huggingface/transformers/pull/36380">the PR</a>!</p> |
| <div class="wide"> <figure class="html-embed"><figcaption class="html-embed__title" style="text-align:center">Mem allocation patterns during model loading</figcaption><div class="html-embed__card"><div id="frag-dkb8xuhhm3"><div class="d3-warmup-demo"></div> |
| <style> |
| /* Styles for the "CUDA warmup" demo. Everything is scoped under .d3-warmup-demo. |
|    Base rules come first and the responsive @media overrides come last: both use |
|    selectors of equal specificity, so the overrides only win when they appear later. */ |
| .d3-warmup-demo { |
|   font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; |
|   margin: 0; |
|   padding: 0px; |
|   background-color: transparent; |
| } |
| |
| .d3-warmup-demo .container { |
|   max-width: 1200px; |
|   margin: 0 auto; |
|   background: transparent; |
|   border-radius: 0; |
|   padding: 0; |
|   box-shadow: none; |
| } |
| |
| /* Two panels side by side: without warmup / with warmup. */ |
| .d3-warmup-demo .demo-container { |
|   display: flex; |
|   gap: 20px; |
|   margin-bottom: 0px; |
| } |
| |
| .d3-warmup-demo .side { |
|   flex: 1; |
|   border: 2px solid var(--border-color); |
|   border-radius: 8px; |
|   padding: 20px; |
|   background: var(--page-bg, var(--surface-bg)); |
| } |
| |
| /* Area in which the per-layer boxes are appended. */ |
| .d3-warmup-demo .memory-area { |
|   height: 200px; |
|   border: 2px dashed var(--border-color); |
|   border-radius: 6px; |
|   padding: 10px; |
|   margin: 20px 0; |
|   background: var(--page-bg, var(--surface-bg)); |
|   position: relative; |
|   overflow: hidden; |
| } |
| |
| .d3-warmup-demo .layer-box { |
|   width: 80px; |
|   height: 30px; |
|   border: 2px solid var(--text-color); |
|   border-radius: 4px; |
|   margin: 3px; |
|   display: inline-block; |
|   position: relative; |
|   background: transparent; |
|   transition: all .3s ease; |
| } |
| |
| /* Layer state: allocation in progress (labelled "malloc"). */ |
| .d3-warmup-demo .layer-box.allocating { |
|   background: var(--muted-color); |
|   border-color: var(--border-color); |
|   opacity: 0.3; |
| } |
| |
| .d3-warmup-demo .layer-box.allocating::after { |
|   content: "malloc"; |
|   position: absolute; |
|   inset: 0; |
|   display: flex; |
|   align-items: center; |
|   justify-content: center; |
|   font-size: 10px; |
|   color: var(--muted-color); |
|   font-weight: bold; |
| } |
| |
| /* Layer state: data loaded (labelled "data"). */ |
| .d3-warmup-demo .layer-box.loaded { |
|   background: var(--good-color, #198754); |
|   border-color: var(--good-color, #198754); |
|   opacity: 0.8; |
| } |
| |
| .d3-warmup-demo .layer-box.loaded::after { |
|   content: "data"; |
|   position: absolute; |
|   inset: 0; |
|   display: flex; |
|   align-items: center; |
|   justify-content: center; |
|   font-size: 10px; |
|   color: white; |
|   font-weight: bold; |
| } |
| |
| /* Pre-allocated pool and the progress bar that fills it. */ |
| .d3-warmup-demo .warmup-container { |
|   width: 100%; |
|   height: 60px; |
|   border: 2px solid var(--primary-color); |
|   border-radius: 6px; |
|   margin-bottom: 20px; |
|   background: transparent; |
|   position: relative; |
|   overflow: hidden; |
| } |
| |
| .d3-warmup-demo .warmup-container.allocated { |
|   border-color: var(--warmup-color); |
|   background: var(--warmup-color); |
|   opacity: 1; |
| } |
| |
| .d3-warmup-demo .warmup-container::before { |
|   content: "Pre-allocated Memory Pool"; |
|   position: absolute; |
|   top: 50%; |
|   left: 50%; |
|   transform: translate(-50%, -50%); |
|   font-size: 14px; |
|   color: var(--muted-color); |
|   font-weight: bold; |
|   z-index: 1; |
| } |
| |
| .d3-warmup-demo .warmup-container.allocated::before { |
|   color: white; |
| } |
| |
| .d3-warmup-demo .warmup-fill { |
|   height: 100%; |
|   width: 0%; |
|   transition: width .5s ease; |
|   border-radius: 3px; |
|   position: relative; |
|   z-index: 2; |
|   background: var(--good-color, #198754); |
|   opacity: 0.8; |
| } |
| |
| .d3-warmup-demo .warmup-fill::after { |
|   content: "Layer Data Loading"; |
|   position: absolute; |
|   top: 50%; |
|   left: 50%; |
|   transform: translate(-50%, -50%); |
|   font-size: 12px; |
|   color: white; |
|   font-weight: bold; |
|   white-space: nowrap; |
| } |
| |
| /* Elapsed-time readout, coloured per panel. */ |
| .d3-warmup-demo .timing { |
|   text-align: center; |
|   font-size: 24px; |
|   font-weight: bold; |
|   margin: 15px 0; |
|   min-height: 30px; |
|   color: var(--text-color); |
| } |
| |
| .d3-warmup-demo .no-warmup .timing { |
|   color: var(--bad-color, #d63384); |
| } |
| |
| .d3-warmup-demo .with-warmup .timing { |
|   color: var(--good-color, #198754); |
| } |
| |
| .d3-warmup-demo .controls { |
|   text-align: center; |
|   margin: 10px 0 30px 0; |
| } |
| |
| .d3-warmup-demo .btn { |
|   background: var(--primary-color); |
|   color: white; |
|   border: none; |
|   padding: 16px 32px; |
|   border-radius: 6px; |
|   font-size: 16px; |
|   margin-right: 10px; |
|   cursor: pointer; |
|   transition: background .3s ease; |
| } |
| |
| .d3-warmup-demo .btn:hover { |
|   background: var(--primary-color); |
|   opacity: 0.8; |
| } |
| |
| .d3-warmup-demo .btn:disabled { |
|   background: var(--muted-color); |
|   cursor: not-allowed; |
|   opacity: 0.5; |
| } |
| |
| .d3-warmup-demo .description { |
|   background: transparent; |
|   padding: 15px; |
|   border-radius: 6px; |
|   margin-top: 15px; |
|   font-size: 14px; |
|   line-height: 1.5; |
|   color: var(--text-color); |
| } |
| |
| /* Legend swatches used inside the descriptions. */ |
| .d3-warmup-demo .color-square { |
|   display: inline-block; |
|   width: 12px; |
|   height: 12px; |
|   border-radius: 2px; |
|   margin-right: 6px; |
|   vertical-align: middle; |
| } |
| |
| .d3-warmup-demo .color-square.warmup-color { |
|   background: var(--warmup-color); |
| } |
| |
| .d3-warmup-demo .color-square.good-color { |
|   background: var(--good-color); |
| } |
| |
| .d3-warmup-demo .phase-indicator { |
|   font-size: 14px; |
|   color: var(--muted-color); |
|   text-align: center; |
|   margin-top: 10px; |
|   min-height: 20px; |
| } |
| |
| .d3-warmup-demo .layer-counter { |
|   text-align: center; |
|   font-size: 16px; |
|   color: var(--text-color); |
|   margin: 10px 0; |
| } |
| |
| /* Tablets and below: stack the two panels and tighten spacing. */ |
| @media (max-width: 768px) { |
|   .d3-warmup-demo .demo-container { |
|     flex-direction: column; |
|     gap: 15px; |
|   } |
| |
|   .d3-warmup-demo .side { |
|     padding: 15px; |
|   } |
| |
|   .d3-warmup-demo .memory-area { |
|     height: 150px; |
|     padding: 8px; |
|   } |
| |
|   .d3-warmup-demo .layer-box { |
|     width: 60px; |
|     height: 25px; |
|     margin: 2px; |
|   } |
| |
|   /* Same specificity as the base label rules, otherwise their 10px font size would win. */ |
|   .d3-warmup-demo .layer-box.allocating::after, |
|   .d3-warmup-demo .layer-box.loaded::after { |
|     font-size: 8px; |
|   } |
| |
|   .d3-warmup-demo .warmup-container { |
|     height: 50px; |
|     margin-bottom: 15px; |
|     border: 2px solid var(--primary-color); |
|   } |
| |
|   .d3-warmup-demo .warmup-container::before { |
|     font-size: 12px; |
|   } |
| |
|   .d3-warmup-demo .warmup-fill::after { |
|     font-size: 10px; |
|   } |
| |
|   .d3-warmup-demo .timing { |
|     font-size: 20px; |
|   } |
| |
|   .d3-warmup-demo .layer-counter { |
|     font-size: 14px; |
|   } |
| |
|   .d3-warmup-demo .btn { |
|     padding: 12px 24px; |
|     font-size: 14px; |
|     margin-right: 8px; |
|     margin-bottom: 8px; |
|   } |
| |
|   .d3-warmup-demo .description { |
|     padding: 12px; |
|     font-size: 13px; |
|   } |
| |
|   .d3-warmup-demo .phase-indicator { |
|     font-size: 12px; |
|   } |
| } |
| |
| /* Phones: declared after the 768px block so these values take precedence on the narrowest screens. */ |
| @media (max-width: 480px) { |
|   .d3-warmup-demo .side { |
|     padding: 12px; |
|   } |
| |
|   .d3-warmup-demo .memory-area { |
|     height: 120px; |
|     padding: 6px; |
|   } |
| |
|   .d3-warmup-demo .layer-box { |
|     width: 45px; |
|     height: 20px; |
|     margin: 1px; |
|   } |
| |
|   .d3-warmup-demo .layer-box.allocating::after, |
|   .d3-warmup-demo .layer-box.loaded::after { |
|     font-size: 7px; |
|   } |
| |
|   .d3-warmup-demo .warmup-container { |
|     height: 40px; |
|     margin-bottom: 12px; |
|     border: 2px solid var(--primary-color); |
|   } |
| |
|   .d3-warmup-demo .warmup-container::before { |
|     font-size: 10px; |
|   } |
| |
|   .d3-warmup-demo .warmup-fill::after { |
|     font-size: 9px; |
|   } |
| |
|   .d3-warmup-demo .timing { |
|     font-size: 18px; |
|   } |
| |
|   .d3-warmup-demo .layer-counter { |
|     font-size: 13px; |
|   } |
| |
|   .d3-warmup-demo .btn { |
|     padding: 10px 20px; |
|     font-size: 13px; |
|     margin-right: 6px; |
|     margin-bottom: 6px; |
|     width: calc(50% - 3px); |
|   } |
| |
|   .d3-warmup-demo .description { |
|     padding: 10px; |
|     font-size: 12px; |
|   } |
| |
|   .d3-warmup-demo .phase-indicator { |
|     font-size: 11px; |
|   } |
| } |
| </style> |
|
|
| <script> |
| (() => { |
| const bootstrap = () => { |
| const scriptEl = document.currentScript; |
| let container = scriptEl ? scriptEl.previousElementSibling : null; |
| if (!(container && container.classList && container.classList.contains('d3-warmup-demo'))) { |
| const candidates = Array.from(document.querySelectorAll('.d3-warmup-demo')) |
| .filter((el) => !(el.dataset && el.dataset.mounted === 'true')); |
| container = candidates[candidates.length - 1] || null; |
| } |
| if (!container) return; |
| if (container.dataset) { |
| if (container.dataset.mounted === 'true') return; |
| container.dataset.mounted = 'true'; |
| } |
| |
| |
| const getColors = () => { |
| if (window.ColorPalettes) { |
| const primary = window.ColorPalettes.getPrimary(); |
| const categorical = window.ColorPalettes.getColors('categorical', 3); |
| return { |
| primary: primary, |
| good: '#198754', |
| bad: '#dc3545', |
| warmup: primary |
| }; |
| } |
| return { |
| primary: 'var(--primary-color)', |
| good: '#198754', |
| bad: '#dc3545', |
| warmup: 'var(--primary-color)' |
| }; |
| }; |
| |
| |
| container.innerHTML = ` |
| <div class="container"> |
| <div class="controls"> |
| <button class="btn" id="startBtn">Start Animation</button> |
| <button class="btn" id="resetBtn">Reset</button> |
| </div> |
| |
| <div class="demo-container"> |
| <div class="side no-warmup"> |
| <div class="timing" id="noWarmupTime">0.00s</div> |
| <div class="layer-counter" id="noWarmupCounter">Layers loaded: 0/10</div> |
| <div class="phase-indicator" id="noWarmupPhase"></div> |
| <div class="memory-area" id="noWarmupArea"></div> |
| <div class="description"> |
| <strong>Individual Allocations:</strong><br> |
| Each model layer triggers a separate cudaMalloc() call, creating memory fragmentation and allocation overhead. |
| <br><br> |
| 📦 <strong>Grey "malloc"</strong> = Memory allocation overhead<br> |
| ✅ <strong>Green "data"</strong> = Actual layer data loading |
| </div> |
| </div> |
| |
| <div class="side with-warmup"> |
| <div class="timing" id="warmupTime">0.00s</div> |
| <div class="layer-counter" id="warmupCounter">Layers loaded: 0/10</div> |
| <div class="phase-indicator" id="warmupPhase"></div> |
| <div class="memory-area" id="warmupArea"> |
| <div class="warmup-container" id="warmupContainer"> |
| <div class="warmup-fill" id="warmupFill"></div> |
| </div> |
| <div id="warmupLayers"></div> |
| </div> |
| <div class="description"> |
| <strong>Pre-allocated Pool:</strong><br> |
| The warmup function calculates total memory needed and makes ONE large allocation. Subsequent layers load directly into this pool, eliminating malloc overhead. |
| <br><br> |
| <span class="color-square warmup-color"></span> <strong>Container</strong> = Single large malloc (warmup)<br> |
| <span class="color-square good-color"></span> <strong>Progress bar</strong> = Layer data loading (no malloc needed) |
| </div> |
| </div> |
| </div> |
| </div> |
| `; |
| |
| |
| const colors = getColors(); |
| const style = document.createElement('style'); |
| style.textContent = ` |
| .d3-warmup-demo .no-warmup h2 { color: ${colors.bad}; } |
| .d3-warmup-demo .with-warmup h2 { color: ${colors.good}; } |
| .d3-warmup-demo .no-warmup .timing { color: ${colors.bad}; } |
| .d3-warmup-demo .with-warmup .timing { color: ${colors.good}; } |
| .d3-warmup-demo { |
| --warmup-color: ${colors.warmup}; |
| --good-color: ${colors.good}; |
| --bad-color: ${colors.bad}; |
| --primary-color: ${colors.primary}; |
| } |
| .d3-warmup-demo .layer-box.loaded { background: ${colors.good}; border-color: ${colors.good}; } |
| .d3-warmup-demo .warmup-container.allocated { border-color: ${colors.warmup}; background: ${colors.warmup}; } |
| .d3-warmup-demo .warmup-container.allocated::before { color: white; } |
| .d3-warmup-demo .warmup-fill { background: ${colors.good}; } |
| .d3-warmup-demo .color-square.warmup-color { background: ${colors.warmup}; } |
| .d3-warmup-demo .color-square.good-color { background: ${colors.good}; } |
| .d3-warmup-demo .btn { background: ${colors.primary}; } |
| `; |
| container.appendChild(style); |
| |
| |
| let animationSpeed = 1 / 2.4; |
| let isRunning = false; |
| const totalLayers = 10; |
| |
| function startDemo() { |
| if (isRunning) return; |
| isRunning = true; |
| |
| document.getElementById('startBtn').disabled = true; |
| document.getElementById('resetBtn').disabled = true; |
| |
| Promise.all([ |
| animateNoWarmup(), |
| animateWithWarmup() |
| ]).then(() => { |
| isRunning = false; |
| document.getElementById('startBtn').disabled = false; |
| document.getElementById('resetBtn').disabled = false; |
| }); |
| } |
| |
| function resetDemo() { |
| if (isRunning) return; |
| |
| document.getElementById('noWarmupArea').innerHTML = ''; |
| document.getElementById('warmupLayers').innerHTML = ''; |
| document.getElementById('warmupFill').style.width = '0%'; |
| document.getElementById('warmupContainer').classList.remove('allocated'); |
| |
| document.getElementById('noWarmupTime').textContent = '0.00s'; |
| document.getElementById('warmupTime').textContent = '0.00s'; |
| |
| document.getElementById('noWarmupCounter').textContent = 'Layers loaded: 0/10'; |
| document.getElementById('warmupCounter').textContent = 'Layers loaded: 0/10'; |
| |
| document.getElementById('noWarmupPhase').textContent = ''; |
| document.getElementById('warmupPhase').textContent = ''; |
| } |
| |
| async function animateNoWarmup() { |
| const container = document.getElementById('noWarmupArea'); |
| const timeEl = document.getElementById('noWarmupTime'); |
| const counterEl = document.getElementById('noWarmupCounter'); |
| const phaseEl = document.getElementById('noWarmupPhase'); |
| |
| let currentTime = 0; |
| const baseDelay = 200 / animationSpeed; |
| |
| phaseEl.textContent = 'Loading model layers...'; |
| |
| for (let i = 0; i < totalLayers; i++) { |
| const layerBox = document.createElement('div'); |
| layerBox.className = 'layer-box'; |
| container.appendChild(layerBox); |
| |
| await sleep(baseDelay * 0.3); |
| layerBox.classList.add('allocating'); |
| currentTime += 0.08; |
| timeEl.textContent = currentTime.toFixed(2) + 's'; |
| |
| await sleep(baseDelay * 0.7); |
| layerBox.classList.remove('allocating'); |
| layerBox.classList.add('loaded'); |
| currentTime += 0.12; |
| timeEl.textContent = currentTime.toFixed(2) + 's'; |
| |
| counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`; |
| } |
| |
| phaseEl.textContent = 'Complete!'; |
| } |
| |
| async function animateWithWarmup() { |
| const container = document.getElementById('warmupLayers'); |
| const timeEl = document.getElementById('warmupTime'); |
| const counterEl = document.getElementById('warmupCounter'); |
| const phaseEl = document.getElementById('warmupPhase'); |
| const warmupContainer = document.getElementById('warmupContainer'); |
| const warmupFill = document.getElementById('warmupFill'); |
| |
| let currentTime = 0; |
| const baseDelay = 200 / animationSpeed; |
| |
| phaseEl.textContent = 'Warming up allocator...'; |
| await sleep(baseDelay * 2); |
| warmupContainer.classList.add('allocated'); |
| currentTime += 0.3; |
| timeEl.textContent = currentTime.toFixed(2) + 's'; |
| |
| phaseEl.textContent = 'Loading model layers...'; |
| |
| for (let i = 0; i < totalLayers; i++) { |
| const layerBox = document.createElement('div'); |
| layerBox.className = 'layer-box loaded'; |
| |
| if (window.innerWidth <= 480) { |
| layerBox.style.width = '35px'; |
| layerBox.style.height = '18px'; |
| } else if (window.innerWidth <= 768) { |
| layerBox.style.width = '40px'; |
| layerBox.style.height = '20px'; |
| } else { |
| layerBox.style.width = '40px'; |
| layerBox.style.height = '20px'; |
| } |
| container.appendChild(layerBox); |
| |
| const progress = ((i + 1) / totalLayers) * 100; |
| warmupFill.style.width = progress + '%'; |
| |
| await sleep(baseDelay * 0.5); |
| currentTime += 0.08; |
| timeEl.textContent = currentTime.toFixed(2) + 's'; |
| |
| counterEl.textContent = `Layers loaded: ${i + 1}/${totalLayers}`; |
| } |
| |
| phaseEl.textContent = 'Complete!'; |
| } |
| |
| function sleep(ms) { |
| return new Promise(resolve => setTimeout(resolve, ms)); |
| } |
| |
| |
| const handleResize = () => { |
| |
| const warmupLayers = document.getElementById('warmupLayers'); |
| if (warmupLayers) { |
| const layerBoxes = warmupLayers.querySelectorAll('.layer-box'); |
| layerBoxes.forEach(layerBox => { |
| if (window.innerWidth <= 480) { |
| layerBox.style.width = '35px'; |
| layerBox.style.height = '18px'; |
| } else if (window.innerWidth <= 768) { |
| layerBox.style.width = '40px'; |
| layerBox.style.height = '20px'; |
| } else { |
| layerBox.style.width = '40px'; |
| layerBox.style.height = '20px'; |
| } |
| }); |
| } |
| }; |
| |
| |
| document.getElementById('startBtn').addEventListener('click', startDemo); |
| document.getElementById('resetBtn').addEventListener('click', resetDemo); |
| window.addEventListener('resize', handleResize); |
| }; |
| |
| if (document.readyState === 'loading') { |
| document.addEventListener('DOMContentLoaded', bootstrap, { once: true }); |
| } else { |
| bootstrap(); |
| } |
| })(); |
| </script> |
| </div></div></figure> </div> |
| <p>It’s hard to overstate how much of a lifesaver that is when you’re trying to load a model as fast as possible, as it’s the narrowest bottleneck for your iteration speed.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Pre-allocating GPU memory removes malloc spikes (e.g., 7× for 8B, 6× for 32B in the referenced PR).</p><p><strong>Next:</strong> consistent interfaces allow transformers-serve.</p> </div> </div> </div> </div> |
| <h3 id="transformers-serve-and-continuous-batching"><a href="#transformers-serve-and-continuous-batching">Transformers-serve and continuous batching</a></h3> |
| <p>Having all these models readily available and sharing the same interface allowed us to implement transformers-serve, a CLI tool to expose models through a standard OpenAI HTTP API.</p> |
| <div class="code-card"><button class="code-copy button--ghost" type="button" aria-label="Copy code"><svg viewBox="0 0 24 24" aria-hidden="true" focusable="false"><path d="M16 1H4c-1.1 0-2 .9-2 2v12h2V3h12V1zm3 4H8c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h11c1.1 0 2-.9 2-2V7c0-1.1-.9-2-2-2zm0 16H8V7h11v14z"></path></svg></button><pre class="astro-code astro-code-themes github-light github-dark" style="--shiki-light:#24292e;--shiki-dark:#e1e4e8;--shiki-light-bg:#fff;--shiki-dark-bg:#24292e;overflow-x:auto" tabindex="0" data-language="bash"><code><span class="line"><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">transformers</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> serve</span></span> |
| <span class="line"></span> |
| <span class="line"><span style="--shiki-light:#6F42C1;--shiki-dark:#B392F0">curl</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> -X</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> POST</span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF"> http://localhost:8000/v1/chat/completions</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> \</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">-H </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">"Content-Type: application/json"</span><span style="--shiki-light:#005CC5;--shiki-dark:#79B8FF"> \</span></span> |
| <span class="line"><span style="--shiki-light:#24292E;--shiki-dark:#E1E4E8">-d </span><span style="--shiki-light:#032F62;--shiki-dark:#9ECBFF">'{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'</span></span> |
| <span class="line"></span></code></pre></div> |
| <p><code>transformers-serve</code> uses continuous batching (see <a href="https://github.com/huggingface/transformers/pull/38085">this PR</a> and also <a href="https://github.com/huggingface/transformers/pull/40426">this one</a>) for better GPU utilization, and is very much linked to the great work of vLLM with the <code>paged attention kernel</code> – a further justification of <a href="#community-kernels">external kernels</a>.</p> |
| <p><code>transformers-serve</code> is not meant for user-facing production services (tools like vLLM or SGLang are super optimized for that), but it’s useful for several use cases:</p> |
| <ul> |
| <li>Quickly verify that your model is compatible with continuous batching and paged attention.</li> |
| <li>Run ad-hoc vibe tests on any model, without worrying about deploying anything.</li> |
| <li>Run evaluations efficiently, again without having to spend a lot of time engineering your infrastructure.</li> |
| </ul> |
| <p>For model deployment, check <a href="https://huggingface.co/docs/inference-providers/en/index">Inference Providers</a> or roll your own solution using any of the excellent serving libraries.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>OpenAI-compatible surface + continuous batching; kernels/backends slot in because the modeling API stayed stable.</p><p><strong>Next:</strong> reuse across vLLM/SGLang relies on the same consistency.</p> </div> </div> </div> </div> |
| <h2 id="community-reusability"><a href="#community-reusability">Community reusability</a></h2> |
| <p>The transformers-serve CLI is built on transformers, for sure, but the library is made first and foremost to be <em>reused</em> at large by the open-source ecosystem.</p> |
| <p>Adding a model to transformers means:</p> |
| <ul> |
| <li>having it immediately available to the community</li> |
| <li>having it immediately usable in vLLM, <a href="https://huggingface.co/blog/transformers-backend-sglang">SGLang</a>, and so on without additional code. In the case of vLLM, transformers was added as a backend to run models on vLLM, which optimizes throughput/latency on top of <em>existing</em> transformers architectures <a href="https://blog.vllm.ai/2025/04/11/transformers-backend.html">as seen in this great vLLM x HF blog post.</a></li> |
| <li>being the reference code for implementations in MLX, llama.cpp and other libraries.</li> |
| </ul> |
| <p>This further cements the need for a <span class="glossary-term " style="" data-tooltip-id="tenet-ytnt51w7pph" onmouseenter="window.showTenetTooltip(event, 'tenet-ytnt51w7pph')" onmousemove="window.updateTenetTooltip(event, 'tenet-ytnt51w7pph')" onmouseleave="window.hideTenetTooltip('tenet-ytnt51w7pph')">consistent public surface</span><span id="tenet-ytnt51w7pph" class="glossary-tooltip" data-position="top"> <span class="glossary-tooltip__content"> <span class="glossary-tooltip__term">consistent-public-surface</span> <span class="glossary-tooltip__definition">Uniform naming, signatures, and conventions across all models for predictability.</span> </span> <span class="glossary-tooltip__arrow"></span> </span> <script> |
| if (!window.tenetTooltipInitialized) { |
| window.tenetTooltipInitialized = true; |
| |
| window.showTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'visible'; |
| tooltip.style.opacity = '1'; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.updateTenetTooltip = function(event, id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.top = (event.clientY + 10) + 'px'; |
| tooltip.style.left = (event.clientX + 10) + 'px'; |
| }; |
| |
| window.hideTenetTooltip = function(id) { |
| const tooltip = document.getElementById(id); |
| if (!tooltip) return; |
| tooltip.style.visibility = 'hidden'; |
| tooltip.style.opacity = '0'; |
| }; |
| } |
| </script>: we are a backend and a reference, and there’s more software than us to handle serving. At the time of writing, more effort is being put in that direction. We already have compatible configs for VLMs for vLLM (say that three times fast), check <a href="https://github.com/huggingface/transformers/pull/40696/files">here for GLM4 video support</a>, and here for <a href="https://github.com/huggingface/transformers/pull/40132">MoE support</a>, for instance.</p> |
| <div class="note note--info" data-astro-cid-qg6lmfty> <div class="note__layout" data-astro-cid-qg6lmfty> <div class="note__body" data-astro-cid-qg6lmfty> <div class="note__content" data-astro-cid-qg6lmfty> <p>Being a good backend consumer requires a consistent public surface; modular shards and configs make that stability practical.</p><p><strong>Next:</strong> what changes in v5 without breaking the promise of visible semantics.</p> </div> </div> </div> </div> |
| <h2 id="a-pact-with-the-community-and-what-is-coming-next"><a href="#a-pact-with-the-community-and-what-is-coming-next">A Pact with the Community and what is coming next</a></h2> |
| <p>The next major version of <code>transformers</code> is just around the corner (and will have another blog post to its name when it comes out). When v5 is released, we aim to keep <a href="#backwards-compatibility">backwards compatibility</a> as solid as possible. The changes we make now are in service of that goal.</p> |
| <p>We will lean further into a modular toolbox, not a framework. You should not be forced to rewrite modeling code. It’s better when a model can inherit from <code>PreTrainedModel</code> and opt into Tensor Parallel, <code>from_pretrained</code>, sharding, <code>push_to_hub</code>, loss plumbing, and external stacks like PEFT/TRL/SGLang/vLLM.</p> |
| <p>We wrote this to make our design philosophy legible. Transformers is built by thousands of contributors, but it only stays usable if its core principles are explicit and upheld. These tenets are our pact with you: they ensure that whether you are shipping a new model, contributing an optimized kernel, or simply debugging a forward pass, the code remains transparent and hackable.</p> |
| <p>This is a living document, not a stone tablet. Tell us where these tenets fall short or should evolve next. We’ll keep working, and we’ll be here to share the journey with you all.</p> </main> </section> <footer class="footer"> <div class="footer-inner"> <section class="citation-block"> <h3>Citation</h3> <p>For attribution, cite this work as</p> <pre class="citation short">Pablo Montalvo, Lysandre Debut, Pedro Cuenca, Yoni Gozlan (2025). "Maintain the unmaintainable: 1M python loc, 400+ models".</pre> <p>BibTeX citation</p> <pre class="citation long">@misc{montalvo2025_maintain_the_unmaintaina, |
| title={Maintain the unmaintainable: 1M python loc, 400+ models}, |
| author={Pablo Montalvo and Lysandre Debut and Pedro Cuenca and Yoni Gozlan}, |
| year={2025}, |
| |
| }</pre> </section> <section class="acknowledgements-block"> <h3>Acknowledgements</h3> <p>Special thanks to all the reviewers on this! <a href='https://huggingface.co/reach-vb'>Vaibhav Srivastav</a>, <a href='https://huggingface.co/cyrilvallez'>Cyril Vallez</a>, <a href='https://huggingface.co/yonigozlan'>Yoni Gozlan</a> also for his excellent work on fast image processors, <a href='https://huggingface.co/ArthurZ'>Arthur Zucker</a> for his guidance, and of course the wonderful <a href='https://huggingface.co/tfrere'>Thibaud Frere</a> for designing this template and helping me out with it!<br><br>Most importantly: thanks to the entire Open-Source community, sincerely.</p> </section> <section class="references-block"> </section> </div> </footer> <script> |
| (() => { |
| const getFooter = () => document.currentScript?.closest('footer') || document.querySelector('footer.footer'); |
| const footer = getFooter(); |
| if (!footer) return; |
| const target = footer.querySelector('.references-block'); |
| if (!target) return; |
| |
| const contentRoot = document.querySelector('section.content-grid main') || document.querySelector('main') || document.body; |
| |
| const ensureHeading = (text) => { |
| const exists = Array.from(target.children).some((c) => c.tagName === 'H3' && c.textContent.trim().toLowerCase() === text.toLowerCase()); |
| if (!exists) { |
| const h = document.createElement('h3'); |
| h.textContent = text; |
| target.appendChild(h); |
| } |
| }; |
| |
| const moveIntoFooter = (element, headingText) => { |
| if (!element) return false; |
| |
| const firstHeading = element.querySelector(':scope > h1, :scope > h2, :scope > h3'); |
| if (firstHeading) { |
| const txt = (firstHeading.textContent || '').trim().toLowerCase(); |
| const targetTxt = headingText.trim().toLowerCase(); |
| if (txt === targetTxt || txt.includes('reference') || txt.includes('bibliograph')) { |
| firstHeading.remove(); |
| } |
| } |
| ensureHeading(headingText); |
| target.appendChild(element); |
| return true; |
| }; |
| const run = () => { |
| const findFirstOutsideFooter = (selectors) => { |
| for (const sel of selectors) { |
| const el = contentRoot.querySelector(sel); |
| if (el && !footer.contains(el)) return el; |
| } |
| return null; |
| }; |
| |
| const referencesEl = findFirstOutsideFooter(['#references', '.references', '.bibliography']); |
| const footnotesEl = findFirstOutsideFooter(['.footnotes']); |
| |
| const movedRefs = moveIntoFooter(referencesEl, 'References'); |
| const movedNotes = moveIntoFooter(footnotesEl, 'Footnotes'); |
| return movedRefs || movedNotes; |
| }; |
| |
| |
| const done = run(); |
| if (!done) { |
| const onReady = () => run(); |
| if (document.readyState === 'loading') { |
| document.addEventListener('DOMContentLoaded', onReady, { once: true }); |
| } else { |
| setTimeout(onReady, 0); |
| } |
| } |
| |
| |
| |
| })(); |
| </script> </body> </html> |