"""Encoder-specific render module.

Reuses the parent demo's format functions (fraud bar, top-k predictions,
timeline) directly via import — those are model-agnostic. The encoder demo
only adds two pieces of new content:

1. `render_why_encoder()` — the architectural pitch for the encoder pattern
2. `render_encoder_integration()` — build-it-yourself integration guide

The fraud / merchant / amount / MCC / top-k / timeline formatters are
unchanged from the parent. We import them here so the encoder demo's
prediction cards are visually consistent with the parent demo.
"""
|
|
| from __future__ import annotations |
|
|
| |
| |
| from src.demo.render import ( |
| format_amount_predictions, |
| format_fraud_score, |
| format_mcc_predictions, |
| format_merchant_predictions, |
| format_timeline, |
| format_topk_predictions, |
| ) |
|
|
| |
# Text colors, darkest to lightest.
_TEXT = "#171717"
_TEXT_MUTED = "#525252"
_TEXT_DIM = "#737373"
# Card backgrounds.
_BG_CARD = "#ffffff"
_BG_CARD_ALT = "#fafafa"
# Border colors (translucent black at two strengths).
_BORDER = "rgba(0,0,0,0.1)"
_BORDER_SUBTLE = "rgba(0,0,0,0.05)"
# Accent colors.
_ACCENT_GREEN = "#10B981"
_ACCENT_BLUE = "#3B82F6"
_ACCENT_AMBER = "#F59E0B"
_ACCENT_PURPLE = "#7c3aed"
# Corner radii.
_RADIUS_CARD = "16px"
_RADIUS_SM = "8px"
# CSS font-family stack for monospace labels and figures.
_FONT_MONO = "JetBrains Mono, ui-monospace, SFMono-Regular, monospace"


# Value interpolated into the CSS `max-width` of the outermost <div> that
# each render_* function in this module returns.
_CONTAINER_WIDTH = "1180px"
|
|
|
|
def render_why_encoder() -> str:
    """Return the static HTML body of the "Why Liquid" tab.

    The page opens with the buyer's problem (multi-customer / multi-task
    transaction-FM economics), then the published precedent
    (LFM2.5-Audio / LFM2.5-VL), then the architectural and operational
    properties, and closes with a scope-of-claim section. The copy is written
    for an external audience — no internal codenames, no design-log register,
    no "we claim" framing.

    Returns:
        One inline-styled HTML fragment. The function takes no input: every
        interpolated value is either a module-level style constant or a
        literal defined in this function, so nothing is HTML-escaped (table
        cells deliberately carry ``<b>`` markup).
    """

    def _align(index: int) -> str:
        """First column is left-aligned, every other column right-aligned."""
        return "left" if index == 0 else "right"

    def _table_header(cols: list[str]) -> str:
        """Build the ``<tr>`` of ``<th>`` cells for a comparison table."""
        # str.join instead of += in a loop: one allocation, no reliance on
        # CPython's in-place concatenation optimization.
        ths = "".join(
            f'<th style="padding: 6px 10px; font-size: 10px; color: {_TEXT_DIM};'
            f' text-transform: uppercase; letter-spacing: 0.05em; text-align: {_align(i)};'
            f' font-weight: 600;">{col}</th>'
            for i, col in enumerate(cols)
        )
        return f"<tr style='border-bottom: 1px solid {_BORDER};'>{ths}</tr>"

    def _table_row(cells: list[str], highlight: bool = False) -> str:
        """Build one body ``<tr>``; ``highlight`` adds a faint green background."""
        bg = "background: rgba(16,185,129,0.05);" if highlight else ""
        tds = "".join(
            f'<td style="padding: 6px 10px; font-family: {_FONT_MONO}; font-size: 11px;'
            f' color: {_TEXT}; text-align: {_align(i)};">{cell}</td>'
            for i, cell in enumerate(cells)
        )
        return f"<tr style='border-bottom: 1px solid {_BORDER_SUBTLE}; {bg}'>{tds}</tr>"

    # NOTE: the small inner cards use _RADIUS_SM (currently "8px") rather than
    # a hard-coded literal so they track the module's radius token.
    return f"""
    <div style="max-width: {_CONTAINER_WIDTH}; margin: 0 auto; padding: 16px;
                font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, sans-serif;">

      <!-- Lead -->
      <h2 style="margin: 0 0 4px 0; color: {_TEXT}; font-size: 22px; font-weight: 700;
                 letter-spacing: -0.02em;">
        Why LFM2.5 for Your Transaction Foundation Model
      </h2>
      <p style="color: {_TEXT_DIM}; font-size: 13px; margin: 0 0 24px 0; line-height: 1.5;">
        If you are putting a transaction foundation model into production —
        especially across more than one business unit, customer, or downstream task —
        the architectural choice determines per-customer training cost,
        time-to-first-production-task, and the marginal cost of adding the second
        and third tasks. The encoder-on-pretrained-backbone architecture applies
        a recipe Liquid AI already ships in LFM2.5-Audio and LFM2.5-VL to
        discrete-feature payment sequences. Three properties, each with a
        different kind of evidence.
      </p>

      <!-- 1. Recipe is validated -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 12px;">
        <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
          1. The Recipe Is Already Shipping for Two Other Modalities
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0 0 10px 0;">
          A small per-modality encoder produces continuous embeddings; a projection
          adapter (when needed) maps them into the LFM2.5 text backbone's hidden space;
          LoRA adapts the attention layers per customer. LFM2.5-Audio ingests waveforms
          this way. LFM2.5-VL ingests vision patches this way. This demo applies the
          same shape to discrete transaction tokens.
        </p>
        <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px;">
          <div style="padding: 10px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_BLUE};
                        font-weight: 600; margin-bottom: 4px;">LFM2.5-AUDIO</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Audio encoder → projection → LFM2.5 backbone. Ships in production.
            </div>
          </div>
          <div style="padding: 10px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_BLUE};
                        font-weight: 600; margin-bottom: 4px;">LFM2.5-VL</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Vision encoder → projection → LFM2.5 backbone. Ships at multiple sizes.
            </div>
          </div>
          <div style="padding: 10px; background: rgba(16,185,129,0.06);
                      border: 1px solid rgba(16,185,129,0.2); border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_GREEN};
                        font-weight: 600; margin-bottom: 4px;">TRANSACTIONS (THIS DEMO)</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Structured encoder → frozen LFM2.5-350M + LoRA → multi-head outputs.
            </div>
          </div>
        </div>
      </div>

      <!-- 2. Serving cost / backbone speed -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 12px;">
        <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
          2. The Backbone Serves at Production Latency
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0 0 12px 0;">
          LFM2.5's conv-dominant layer stack gives O(N) prefill scaling on most layers
          where a pure-attention model pays O(N²). Published hardware-in-the-loop
          benchmarks from the LFM2 technical report, S25 / 4K context:
        </p>
        <table style="width: 100%; border-collapse: collapse; margin-bottom: 8px;">
          {_table_header(["Model", "Prefill (tok/s)", "Decode (tok/s)"])}
          {_table_row(["LFM2-2.6B", "<b>116</b>", "<b>30.0</b>"], highlight=True)}
          {_table_row(["Qwen3-4B", "35", "11.4"])}
          {_table_row(["Llama-3.2-3B", "51", "15.8"])}
        </table>
        <p style="font-size: 11px; color: {_TEXT_DIM}; margin: 0; line-height: 1.5;">
          Your serving path is the published LFM2.5 backbone unchanged — only
          the input side differs from a text deployment. The published latency
          advantage transfers directly.
          Source: <a href="https://arxiv.org/abs/2511.23404" style="color: {_TEXT_DIM};
          text-decoration: underline;">LFM2 Technical Report, arXiv 2511.23404</a>.
        </p>
      </div>

      <!-- 3. Frozen base + LoRA is the local maximum -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 12px;">
        <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
          3. Frozen Backbone + LoRA Is the Higher-Quality Configuration at Typical Label Budgets
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0 0 10px 0;">
          Freezing the LFM2.5 backbone and adapting it with LoRA produces higher
          quality than unfreezing the full backbone end-to-end at typical finserv
          label budgets. LoRA’s low-rank update structure acts as effective
          regularization; lifting that constraint lets the backbone memorize the
          training labels rather than generalize. <b>The frozen-backbone commitment
          is not a quality compromise — it is the higher-quality operating
          point.</b>
        </p>
        <table style="width: 100%; border-collapse: collapse; margin-bottom: 6px;">
          {_table_header(["Configuration", "Trainable", "Fraud ROC-AUC", "MCC top-1"])}
          {_table_row(["Frozen backbone + LoRA (this demo)", "~16M", "<b>0.951</b>", "<b>40.5%</b>"],
                      highlight=True)}
          {_table_row(["Full backbone unfreeze", "~370M", "0.900", "38.1%"])}
        </table>
        <p style="font-size: 11px; color: {_TEXT_DIM}; margin: 0; line-height: 1.5;">
          Measured on 200K synthetic sequences (64 transactions × 15 features each).
          At ~16M trainable parameters (encoder + LoRA + heads), per-customer
          adaptation is small relative to the deployed footprint and completes
          in hours, not days.
        </p>
      </div>

      <!-- 4. Multi-head, multi-customer -->
      <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-bottom: 12px;">
        <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                    border-radius: {_RADIUS_CARD};">
          <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
            One Backbone, Many Heads
          </h3>
          <p style="color: {_TEXT_MUTED}; font-size: 12px; line-height: 1.6; margin: 0 0 8px 0;">
            A single forward pass through the backbone produces hidden states that
            four task heads pool independently — fraud detection, next-merchant
            prediction, amount-bucket forecasting, MCC classification. New
            use-cases (disputes, authorization optimization, AML) add a head, not
            a foundation model.
          </p>
          <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM};
                      padding: 6px 8px; background: {_BG_CARD_ALT}; border-radius: 6px;">
            Per-head MLP: ~0.5M params. Add a new task in hours.
          </div>
        </div>

        <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                    border-radius: {_RADIUS_CARD};">
          <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
            One Backbone, Many Customers
          </h3>
          <p style="color: {_TEXT_MUTED}; font-size: 12px; line-height: 1.6; margin: 0 0 8px 0;">
            The pretrained LFM2.5 weights ship once. Per-customer training is the
            encoder + LoRA + heads — under 5% of base size in bf16 slim format.
            Adding a customer is loading new artifacts on top of the cached
            backbone, not retraining from scratch.
          </p>
          <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM};
                      padding: 6px 8px; background: {_BG_CARD_ALT}; border-radius: 6px;">
            Slim per-customer artifact: ~30 MB bf16 at LFM2.5-1.2B scale.
          </div>
        </div>
      </div>

      <!-- 5. Architectural fit -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 12px;">
        <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
          The Architecture Matches Transaction Data Structure
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0 0 8px 0;">
          Transaction data is information-dense locally (within-transaction
          feature correlations, adjacent-transaction continuity) with sparse
          long-range signal (behavioral baselines across the full history).
          LFM2.5 allocates O(N) conv to the dense local patterns and O(N²)
          attention to the sparse global ones. A pure transformer would spend
          O(N²) compute uniformly.
        </p>
        <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px;">
          <div style="padding: 10px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT};
                        font-weight: 600; margin-bottom: 4px;">Within Transaction</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Merchant determines MCC. Entry mode correlates with amount. Dense,
              local, often deterministic. A 3-wide conv kernel captures this.
            </div>
          </div>
          <div style="padding: 10px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT};
                        font-weight: 600; margin-bottom: 4px;">Adjacent Transactions</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Strong temporal continuity. A customer at Starbucks at 8am is
              likely at a similar merchant tomorrow. Local conv handles it.
            </div>
          </div>
          <div style="padding: 10px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT};
                        font-weight: 600; margin-bottom: 4px;">Distant Transactions</div>
            <div style="font-size: 11px; color: {_TEXT_DIM}; line-height: 1.4;">
              Weak but non-zero signal. Behavioral profile matters for fraud
              baseline. This is where attention earns its quadratic cost.
            </div>
          </div>
        </div>
      </div>

      <!-- 6. Data sovereignty -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 12px;">
        <h3 style="color: {_TEXT}; margin: 0 0 4px 0; font-size: 15px; font-weight: 600;">
          Your Data, Your Model, Your Infrastructure
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0;">
          LFM2.5 base weights are open. Liquid licenses the architecture, training
          recipe, and engineering support. Customers train on their proprietary
          data behind their firewall. No data leaves customer infrastructure.
          No dependency on external model APIs. The result is a foundation model
          the customer owns, adapted to their transaction distribution.
        </p>
      </div>

      <!-- Scope: what this demo validates / what a POC would establish -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 16px;">
        <h3 style="color: {_TEXT}; margin: 0 0 8px 0; font-size: 14px; font-weight: 600;">
          Scope of Claim
        </h3>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 12px; font-size: 12px;">
          <div>
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_GREEN};
                        font-weight: 600; margin-bottom: 6px; text-transform: uppercase;
                        letter-spacing: 0.05em;">What this demo validates</div>
            <ul style="margin: 0; padding-left: 14px; color: {_TEXT_MUTED}; line-height: 1.6;">
              <li>The encoder-on-pretrained-backbone architecture used by LFM2.5-Audio
                  and LFM2.5-VL applies to discrete-feature transaction sequences
                  without modifying the transformers library.</li>
              <li>Per-customer training touches ~2–5% of the deployed footprint
                  and trains in hours rather than days.</li>
              <li>On synthetic data, frozen-backbone-plus-LoRA outperforms
                  full-backbone unfreezing on every measured head.</li>
              <li>One pretrained backbone serves all task heads and is identical
                  across customer deployments.</li>
            </ul>
          </div>
          <div>
            <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_AMBER};
                        font-weight: 600; margin-bottom: 6px; text-transform: uppercase;
                        letter-spacing: 0.05em;">What a POC on your data would establish</div>
            <ul style="margin: 0; padding-left: 14px; color: {_TEXT_MUTED}; line-height: 1.6;">
              <li>Whether synthetic-data quality numbers reproduce on your
                  transaction distribution.</li>
              <li>Production-scale quality at LFM2.5-1.2B on your hardware and
                  sequence lengths (this demo runs at LFM2.5-350M).</li>
              <li>Inference latency against your authorization-decision budget
                  at your concurrency.</li>
              <li>Cross-customer or cross-business-unit transfer of the encoder
                  and LoRA artifacts.</li>
            </ul>
          </div>
        </div>
      </div>

      <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM}; text-align: center;">
        Architecture: <a href="https://arxiv.org/abs/2511.23404" style="color: {_TEXT_DIM};
        text-decoration: underline;">arXiv 2511.23404</a> ·
        Weights: <a href="https://huggingface.co/LiquidAI" style="color: {_TEXT_DIM};
        text-decoration: underline;">huggingface.co/LiquidAI</a>
      </div>
    </div>
    """
|
|
|
|
def render_encoder_integration() -> str:
    """Return the static HTML for the build-it-yourself integration guide.

    Walks the reader through every component a customer team would build
    to reproduce this demo on their own data: preprocessing, encoder,
    backbone wiring, heads, postprocessing, training, deployment.
    Includes hyperparameter cards, gotchas, and an engagement timeline.

    Returns:
        One inline-styled HTML fragment. The function takes no input: every
        interpolated value is a module-level style constant or a literal
        defined here, so nothing is HTML-escaped (some card bodies
        deliberately carry ``<code>`` / ``<i>`` markup).
    """

    def _phase_card(num: str, title: str, body: str, detail: str) -> str:
        """Card with a number/title line, a prose body, and a monospace detail strip."""
        return f"""
        <div style="padding: 14px 16px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                    border-radius: {_RADIUS_CARD};">
          <div style="display: flex; align-items: baseline; gap: 8px; margin-bottom: 6px;">
            <span style="font-family: {_FONT_MONO}; font-size: 11px; color: {_TEXT_DIM};
                         font-weight: 600;">{num}</span>
            <span style="font-size: 14px; font-weight: 600; color: {_TEXT};">{title}</span>
          </div>
          <p style="font-size: 12px; color: {_TEXT_MUTED}; line-height: 1.5; margin: 0 0 8px 0;">
            {body}</p>
          <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM};
                      padding: 6px 8px; background: {_BG_CARD_ALT}; border-radius: 6px;
                      line-height: 1.5;">
            {detail}</div>
        </div>"""

    def _gotcha(num: str, title: str, desc: str) -> str:
        """One numbered pitfall line: bold title, em dash, muted description."""
        return f"""
        <div style="display: flex; gap: 8px; padding: 5px 0;
                    border-bottom: 1px solid {_BORDER_SUBTLE};">
          <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM};
                      font-weight: 600; min-width: 18px;">{num}.</div>
          <div>
            <span style="font-size: 12px; font-weight: 600; color: {_TEXT};">{title}</span>
            <span style="font-size: 12px; color: {_TEXT_MUTED};"> — {desc}</span>
          </div>
        </div>"""

    def _pill(text: str) -> str:
        """Dark rounded label used for one stage of the pipeline-flow strip."""
        return (
            f'<span style="padding: 5px 12px; background: {_TEXT}; color: #fff;'
            f' border-radius: 9999px; font-family: {_FONT_MONO};'
            f' font-size: 10px; font-weight: 600;">{text}</span>'
        )

    def _lr_group(label: str, spec: str, note: str) -> str:
        """Training-recipe card: group label, lr / param-count line, rationale."""
        return f"""
        <div style="padding: 12px; background: {_BG_CARD_ALT}; border-radius: {_RADIUS_SM};">
          <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_ACCENT_BLUE};
                      font-weight: 600; margin-bottom: 6px; letter-spacing: 0.05em;">{label}</div>
          <div style="font-family: {_FONT_MONO}; font-size: 11px; color: {_TEXT};
                      margin-bottom: 4px;">{spec}</div>
          <div style="font-size: 12px; color: {_TEXT_MUTED}; line-height: 1.5;">{note}</div>
        </div>"""

    def _recipe_row(key: str, value: str) -> str:
        """Key/value pair for the two-column recipe grid (two sibling divs)."""
        return (
            f'<div style="font-family: {_FONT_MONO}; color: {_TEXT_DIM};">{key}</div>'
            f'<div style="color: {_TEXT_MUTED};">{value}</div>'
        )

    def _engagement_header(label: str) -> str:
        """Header cell for the engagement table.

        All three headers share one style. Previously only the first one
        carried ``letter-spacing: 0.05em``, so the header row rendered unevenly.
        """
        return (
            f'<th style="padding: 6px 10px; text-align: left; font-size: 10px;'
            f' color: {_TEXT_DIM}; text-transform: uppercase;'
            f' letter-spacing: 0.05em; font-weight: 600;">{label}</th>'
        )

    def _engagement_row(phase: str, duration: str, summary: str, *, last: bool = False) -> str:
        """Body row for the engagement table; the last row has no bottom border."""
        row_style = "" if last else f' style="border-bottom: 1px solid {_BORDER_SUBTLE};"'
        return (
            f"<tr{row_style}>"
            f'<td style="padding: 5px 10px; font-size: 12px; font-weight: 600;'
            f' color: {_TEXT};">{phase}</td>'
            f'<td style="padding: 5px 10px; font-family: {_FONT_MONO}; font-size: 11px;'
            f' color: {_TEXT_MUTED};">{duration}</td>'
            f'<td style="padding: 5px 10px; font-size: 12px; color: {_TEXT_MUTED};">'
            f"{summary}</td>"
            f"</tr>"
        )

    arrow = f'<span style="color: {_TEXT_DIM}; font-size: 12px;">→</span>'

    return f"""
    <div style="max-width: {_CONTAINER_WIDTH}; margin: 0 auto; padding: 16px;
                font-family: -apple-system, BlinkMacSystemFont, Segoe UI, Roboto, sans-serif;">

      <h2 style="margin: 0 0 4px 0; color: {_TEXT}; font-size: 22px; font-weight: 700;
                 letter-spacing: -0.02em;">
        Integration Architecture
      </h2>
      <p style="color: {_TEXT_DIM}; font-size: 13px; margin: 0 0 20px 0; line-height: 1.5;">
        How a customer team builds this stack end to end. Six components, three
        ship from Liquid (LFM2.5 base weights, training recipes, architecture
        support); three are customer-bespoke (schema, encoder, task heads).
        Per-customer adaptation is one ML engineer for a few weeks, not a
        research project.
      </p>

      <!-- Pipeline flow -->
      <div style="display: flex; align-items: center; justify-content: center; gap: 6px;
                  margin-bottom: 24px; padding: 10px 0; flex-wrap: wrap;">
        {_pill("Preprocess")}{arrow}
        {_pill("Encode")}{arrow}
        {_pill("Backbone + LoRA")}{arrow}
        {_pill("Heads")}{arrow}
        {_pill("Postprocess")}{arrow}
        {_pill("Deploy")}
      </div>

      <!-- Phase cards -->
      <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 12px; margin-bottom: 20px;">

        {_phase_card(
            "1",
            "Schema & Preprocessing",
            "Define the discrete feature schema first — features, vocab sizes, ordering. "
            "Categorical features (merchant_id, MCC, country) map directly to integer IDs. "
            "Continuous features (amount, days-since-last) get quantile-bucketed into N bins. "
            "High-cardinality features (10K+ merchants) have their long tail bucketed or "
            "factored. Reserve 3 token IDs per feature for MASK / OOV / NULL. "
            "Final per-customer batch shape: (B, T_tx, F).",
            "Sequence: 64 tx × 15 feat = 960 tokens | "
            "amount → 16 quantile bins | "
            "merchant_id top-10K + frequency bucketing for the tail | "
            "unseen values at inference → OOV (ID 1)"
        )}

        {_phase_card(
            "2",
            "Structured Encoder",
            "One embedding table per feature (sized to its vocab) plus a shared "
            "feature-type table. Value + type embeddings are summed to identify "
            "which feature each token represents. The 15 per-tx feature embeddings "
            "are kept as separate positions in the sequence — compressing them to "
            "one token per transaction collapses fraud quality (fraud ROC-AUC drops "
            "to 0.535 on this demo's data), because the per-tx MLP averages away the "
            "intra-tx feature combinations fraud depends on. This is the same shape "
            "as the audio and vision encoders' input embedding step.",
            "Output shape: (B, T_tx*F, d_lfm) = (B, 960, 1024) at LFM2.5-350M | "
            "value_tables[f](token) + type_table(f) | "
            "Encoder params dominated by high-cardinality value tables (~14M at 350M)"
        )}

        {_phase_card(
            "3",
            "Projection Adapter (When Needed)",
            "When the encoder's output dimension matches d_lfm directly, no adapter "
            "is needed — the encoder outputs flow straight into the backbone. When "
            "d_encoder < d_lfm (typical at LFM2.5-1.2B where d_lfm=2048), a single "
            "linear projection lifts the encoder output into the backbone hidden space, "
            "exactly mirroring the audio/VL projection adapter. Layer init: identity "
            "for d_encoder=d_lfm, Xavier for the projection case.",
            "350M: d_lfm=1024, d_encoder=1024, no adapter (identity) | "
            "1.2B: d_lfm=2048, project from d_encoder=512-1024 → 2048 | "
            "Adds ~2M params at 1.2B scale"
        )}

        {_phase_card(
            "4",
            "Backbone + LoRA",
            "Load the pretrained LFM2.5 base from Hugging Face. The backbone’s "
            "parameters are excluded from the optimizer’s parameter set during "
            "training — gradients flow through the backbone to update the upstream "
            "encoder and downstream heads, but the backbone’s own weights are "
            "never modified. Forward pass executes through all 354M backbone "
            "parameters at full capacity, at both training and inference time. "
            "Customer-distribution adaptation enters through (i) LoRA’s low-rank "
            "delta on the attention projections (q_proj / k_proj / v_proj / out_proj) "
            "and (ii) the per-feature encoder, both trained from scratch on customer "
            "labels. Encoder outputs are injected via the published "
            "<code>inputs_embeds</code> hook in <code>Lfm2Model.forward</code>. "
            "Adding LoRA to the conv layers does not improve quality enough to "
            "justify the ~50% increase in training cost; attention-only LoRA is "
            "the recommended starting configuration.",
            "Backbone params excluded from optimizer; backbone forward at full capacity | "
            "LoRA r=16, α=32, dropout 0.05 on q_proj / k_proj / v_proj / out_proj | "
            "PEFT wraps the leaf modules | "
            "~1M LoRA params at 350M, ~2M at 1.2B"
        )}

        {_phase_card(
            "5",
            "Task Heads",
            "Per-task downstream heads pool backbone hidden states and predict via "
            "small MLPs. Fraud (BCE loss) pools the last-transaction stripe — "
            "mean of positions T-F..T (positions 945..959). Categorical heads "
            "(next-merchant, amount-bucket, MCC) use cross-entropy and pool the "
            "<i>pre-last</i> transaction stripe (positions 930..944) to avoid "
            "leaking the prediction target. New tasks add a head, backbone "
            "untouched.",
            "Per-head MLP: 128 hidden, dropout 0.1 | "
            "Pool: <code>last_tx_mean</code> for sequence tasks | "
            "Pool: <code>pre_last_tx_mean</code> for next-tx tasks | "
            "~0.5M params per head"
        )}

        {_phase_card(
            "6",
            "Postprocessing",
            "Fraud logits → sigmoid → probability in [0, 1]; calibrate against the "
            "customer's operational threshold (typical: 70% precision @ 60% recall "
            "for review-queue handoff). Categorical logits → softmax → top-k "
            "distribution. Use the predicted distribution for downstream "
            "decisioning, not just argmax — the runner-up matters when the top-1 "
            "is uncertain. Behavioral attribution: gradient-based saliency on the "
            "per-feature embeddings identifies which input features drove the score.",
            "Fraud: sigmoid(logits) → operational threshold | "
            "Categorical: softmax(logits) → top-k + calibration | "
            "Saliency: ∂loss/∂value_embed identifies driving features"
        )}
      </div>

      <!-- Training recipe (full-width card) -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 16px;">
        <h3 style="color: {_TEXT}; margin: 0 0 8px 0; font-size: 15px; font-weight: 600;">
          Training Recipe
        </h3>
        <p style="color: {_TEXT_MUTED}; font-size: 13px; line-height: 1.6; margin: 0 0 12px 0;">
          Single-stage supervised fine-tune on the customer’s labelled data — no
          separate pretraining stage. Three trainable parameter groups (LoRA delta,
          per-feature encoder, task heads), three learning rates, because each group
          differs in initialization, parameter scale, and gradient-norm profile.
        </p>
        <div style="display: grid; grid-template-columns: 1fr 1fr 1fr; gap: 10px;
                    margin-bottom: 12px;">
          {_lr_group(
              "LORA GROUP",
              "lr = 1e-3 · ~1M params",
              "Low-rank adapters on the backbone’s attention projections. "
              "Initialized so the LoRA path contributes zero at step 0, then steers "
              "attention behavior toward the customer’s distribution. Higher LR "
              "than the encoder group is fine — the low-rank constraint regularizes "
              "the update by construction."
          )}
          {_lr_group(
              "ENCODER GROUP",
              "lr = 3e-4 · ~14M params",
              "Per-feature value tables + feature-type table, from random init on the "
              "customer’s tokenized vocabulary. Lower LR than LoRA because "
              "random-init embedding matrices destabilize at higher rates; "
              "high-cardinality tables (10K-vocab merchant) dominate gradient norm if "
              "not damped."
          )}
          {_lr_group(
              "HEADS GROUP",
              "lr = 1e-3 · ~2M params",
              "Per-task MLPs (fraud, next-merchant, amount-bucket, MCC), from random "
              "init. Higher LR is fine — small per-head parameter count, well-conditioned "
              "loss surface. New downstream tasks attach as additional heads without "
              "retraining the backbone or the encoder."
          )}
        </div>
        <div style="display: grid; grid-template-columns: auto 1fr; gap: 6px 16px;
                    font-size: 12px; padding: 10px 12px; background: {_BG_CARD_ALT};
                    border-radius: {_RADIUS_SM};">
          {_recipe_row("Optimizer", "AdamW, β = (0.9, 0.95), weight decay 0.1")}
          {_recipe_row(
              "Schedule",
              "200-step linear warmup, cosine decay to 10% of peak over ~5K steps"
          )}
          {_recipe_row("Precision", "bf16 forward and backward, fp32 loss accumulation")}
          {_recipe_row(
              "Multi-task",
              "fraud 1.0, categorical heads 0.5 each — chosen to match per-task "
              "gradient norm in the first 200 warmup steps"
          )}
          {_recipe_row(
              "Compute",
              "~2 hours end-to-end on a single A100 at LFM2.5-350M scale"
          )}
        </div>
      </div>

      <!-- Deployment card (full-width) -->
      <div style="margin-bottom: 16px;">
        {_phase_card(
            "Deploy",
            "Per-Customer Adapter on a Shared Backbone",
            "The deployable per-customer artifact is the trained LoRA delta + "
            "per-feature encoder + task heads. The LFM2.5 base is not included; "
            "it is loaded once per serving GPU from the public weights. At "
            "LFM2.5-350M the artifact is ~190 MB in bf16; the unstripped version "
            "including the base would be ~900 MB. Multi-tenant serving keeps "
            "one backbone resident on the GPU and switches the active LoRA "
            "delta + encoder per request, so a new customer adds a small "
            "adapter rather than a second foundation model. The conv-dominant "
            "backbone quantizes cleanly to INT8 for cost-sensitive deployments. "
            "Same code path runs CPU or GPU.",
            "Per-customer artifact: ~190 MB bf16 at LFM2.5-350M (LoRA + encoder + heads) | "
            "Multi-tenant: shared backbone forward + per-request LoRA switching | "
            "CPU (~5s/inference) or H100 (<100ms), same code | "
            "INT8 quantization clean for conv layers"
        )}
      </div>

      <!-- Gotchas -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 20px;">
        <div style="font-size: 14px; font-weight: 600; color: {_TEXT}; margin-bottom: 8px;">
          Configuration Choices That Look Right and Aren’t
        </div>
        {_gotcha("1", "Do not compress each transaction to one token",
                 "an MLP that averages the 15 feature embeddings into a single "
                 "vector destroys the intra-tx fraud signal. Fraud ROC-AUC "
                 "collapsed to 0.535. Keep the full T_tx*F stripe.")}
        {_gotcha("2", "Pool the pre-last transaction, not the last, for next-tx heads",
                 "pooling the last-tx stripe for next-merchant prediction "
                 "leaks the prediction target into the input. Use the prior tx.")}
        {_gotcha("3", "Don't unfreeze the backbone at typical label budgets",
                 "full-backbone unfreezing produces lower quality than frozen-plus-LoRA "
                 "(fraud ROC-AUC 0.951 → 0.900 on this demo's data). LoRA acts as "
                 "effective regularization; lifting it forces overfitting.")}
        {_gotcha("4", "Tied embedding heads need SSL-pretrained value tables",
                 "tying the next-merchant head to the encoder's merchant value table "
                 "without self-supervised pretraining of those tables reduces "
                 "next-merchant top-1 from 7.78% to 3.74%. Use a fresh MLP head "
                 "until SSL pretraining anchors the value tables.")}
        {_gotcha("5", "Average per-feature losses; do not sum",
                 "summing CE losses across features makes high-cardinality "
                 "features (10K-vocab merchant) dominate the gradient. The "
                 "low-cardinality features stop training.")}
        {_gotcha("6", "Match the schema fingerprint between training and inference",
                 "if the tokenizer's vocab changes between training and "
                 "deployment, the encoder's value tables index into a different "
                 "semantic space. Embed the fingerprint in checkpoint metadata.")}
      </div>

      <!-- Engagement model -->
      <div style="padding: 16px 20px; background: {_BG_CARD}; border: 1px solid {_BORDER};
                  border-radius: {_RADIUS_CARD}; margin-bottom: 16px;">
        <div style="font-size: 14px; font-weight: 600; color: {_TEXT}; margin-bottom: 10px;">
          Typical Engagement
        </div>
        <table style="width: 100%; border-collapse: collapse;">
          <tr style="border-bottom: 1px solid {_BORDER};">
            {_engagement_header("Phase")}
            {_engagement_header("Duration")}
            {_engagement_header("What Happens")}
          </tr>
          {_engagement_row(
              "Discovery",
              "1-2 weeks",
              "Schema design, data sample (~100K-1M sequences), compliance review, "
              "architectural fit assessment."
          )}
          {_engagement_row(
              "POC",
              "1 week",
              "Fine-tune encoder + LoRA + heads on customer sample, measurement "
              "report, go/no-go recommendation."
          )}
          {_engagement_row(
              "Production",
              "2-3 months",
              "Customer engineering team builds with Liquid architectural support, "
              "weekly design review, scale-up to LFM2.5-1.2B."
          )}
          {_engagement_row(
              "Scale",
              "Ongoing",
              "Multi-task expansion (add heads), multi-tenant serving, retraining "
              "cadence, architecture evolution.",
              last=True
          )}
        </table>
      </div>

      <div style="font-family: {_FONT_MONO}; font-size: 10px; color: {_TEXT_DIM}; text-align: center;">
        Architecture: <a href="https://arxiv.org/abs/2511.23404" style="color: {_TEXT_DIM};
        text-decoration: underline;">arXiv 2511.23404</a> ·
        Base weights: <a href="https://huggingface.co/LiquidAI" style="color: {_TEXT_DIM};
        text-decoration: underline;">huggingface.co/LiquidAI</a>
      </div>
    </div>
    """
|
|