Spaces:

MukulRay
/

recon

Sleeping

App Files Files Community

MukulRay committed on Mar 28

Commit

5f2dbd3

1 Parent(s): 0f02637

Phase 9: Gradio UI, critic debug panel, claim table, session sidebar, MD export

Browse files

Files changed (6) hide show

app.py +233 -0
data/cache/7dbd31b978cf26642382f8e8fbf51d4c.json +1 -0
data/cache/af237adc7793290b2c582b6d100fead6.json +1 -0
data/cache/cc860319625f8f1b0658f48c26ada610.json +1 -0
data/cache/e5c4fcb986d222d9f17c613ce1b46edb.json +1 -0
data/cache/ee737556fd55e47471ca6ec835d5b80f.json +1 -0

app.py CHANGED Viewed

	@@ -0,0 +1,233 @@

+import gradio as gr
+import uuid
+import os
+import tempfile
+import logging
+from dotenv import load_dotenv
+from src.graph import run_recon
+from src.memory import init_db, load_session, export_session_md
+# Load variables from a .env file into the process environment (python-dotenv).
+load_dotenv()
+# Configure the root logger at WARNING, so INFO and DEBUG records are not emitted.
+logging.basicConfig(level=logging.WARNING)
+logger = logging.getLogger(__name__)
+# NOTE(review): init_db is imported from src.memory; presumably it prepares the
+# store that load_session reads — confirm against src/memory.
+init_db()
+# ---------------------------------------------------------------------------
+# Core run function
+# ---------------------------------------------------------------------------
+_VERDICT_EMOJI = {
+    "PASS": "✅", "FORCED_PASS": "⚠️",
+    "STALE": "🕰️", "CONTRADICTED": "⚡", "INSUFFICIENT": "📉",
+}
+_CONFIDENCE_EMOJI = {"high": "🟢", "medium": "🟡", "low": "🔴"}
+def _truncate(text, limit):
+    """Return text cut to limit characters, with "..." appended when it was cut."""
+    text = text or ""
+    return text[:limit] + "..." if len(text) > limit else text
+def _md_cell(value):
+    """Render value as text that cannot break out of a Markdown table cell."""
+    # An unescaped "|" opens a new column and a line break ends the row.
+    return str(value).replace("|", "\\|").replace("\r", " ").replace("\n", " ")
+def _format_critic_panel(result):
+    """Build the critic debug panel Markdown from the pipeline result mapping."""
+    # `or` instead of a .get() default: a key that is present but None must
+    # fall back too (None would otherwise crash the :.0f latency format).
+    verdict = result.get("critic_verdict") or "N/A"
+    lines = [
+        "### Critic Debug Panel",
+        f"**Verdict:** {_VERDICT_EMOJI.get(verdict, '❓')} `{verdict}`",
+        f"**Notes:** {result.get('critic_notes') or ''}",
+        f"**Retries:** {result.get('retry_count') or 0}",
+        f"**Papers used:** {len(result.get('retrieved_papers') or [])}",
+        f"**Latency:** {(result.get('latency_ms') or 0):.0f}ms",
+        f"**Decay config:** {result.get('decay_config') or 'linear'}",
+    ]
+    panel = "\n".join(lines) + "\n"
+    rewritten = result.get("rewritten_questions")
+    if rewritten:
+        panel += "\n**Rewritten questions:**\n"
+        panel += "".join(f"- {q}\n" for q in rewritten)
+    return panel
+def _format_claims_table(claims):
+    """Build the claim-confidence Markdown table, or a placeholder when claims is empty."""
+    if not claims:
+        return "*No claims extracted.*"
+    rows = [
+        "### Claim Confidence Table\n",
+        "| Confidence | Claim | Source | Year | Flag |",
+        "|-----------|-------|--------|------|------|",
+    ]
+    for c in claims:
+        flag = "⚠️" if c.flagged else ""
+        # Truncate first, then escape, so the visible length limit is unchanged.
+        claim_text = _md_cell(_truncate(c.text, 80))
+        source = _md_cell(_truncate(c.source_title, 35))
+        conf_emoji = _CONFIDENCE_EMOJI.get(c.confidence, "⚪")
+        rows.append(f"| {conf_emoji} {c.confidence.upper()} | {claim_text} | {source} | {c.source_year} | {flag} |")
+    return "\n".join(rows) + "\n"
+def _format_session_sidebar(session_id):
+    """Build the session sidebar Markdown from the context load_session returns."""
+    ctx = load_session(session_id)
+    parts = [
+        f"### Session `{session_id[:8]}...`\n\n",
+        f"**Turns:** {len(ctx.prior_queries)}\n\n",
+    ]
+    if ctx.prior_queries:
+        parts.append("**Queries:**\n")
+        parts.extend(f"- {q[:60]}\n" for q in ctx.prior_queries)
+    if ctx.flagged_contradictions:
+        parts.append("\n**Contradictions flagged:**\n")
+        # Only the first three contradictions are listed.
+        parts.extend(f"- {c[:80]}\n" for c in ctx.flagged_contradictions[:3])
+    return "".join(parts)
+def run_query(query, session_id, decay_config, history):
+    """Run one research query through run_recon and stream UI updates.
+
+    This is a generator. Every item is a 7-tuple:
+    (chat history, session id, critic panel Markdown, claims Markdown,
+    session sidebar Markdown, export Markdown, None). The last slot is always
+    None here.
+
+    A blank query yields a single "no query" tuple. Otherwise a "running"
+    tuple is yielded first, followed by either an error tuple (run_recon
+    raised) or the final result tuple. A blank session_id is replaced by a
+    new uuid4 string, which is returned in every subsequent tuple.
+    """
+    if not (query or "").strip():
+        yield history, session_id, "*No query entered.*", "*No query entered.*", "*No session.*", "", None
+        return
+    if not (session_id or "").strip():
+        session_id = str(uuid.uuid4())
+    # Add user message (a new list is built; the caller's list is not mutated).
+    history = (history or []) + [{"role": "user", "content": query}]
+    yield history, session_id, "🔍 Running pipeline...", "", "*Updating...*", "", None
+    try:
+        result = run_recon(
+            query=query,
+            session_id=session_id,
+            decay_config=decay_config,
+        )
+    except Exception as e:
+        # UI boundary: report the failure in the chat instead of crashing the
+        # event, but keep the traceback in the log.
+        logger.exception("Pipeline error: %s", e)
+        history = history + [{"role": "assistant", "content": f"❌ Error: {e}"}]
+        yield history, session_id, f"❌ Error: {e}", "", "", "", None
+        return
+    position = result.get("synthesized_position") or "No position generated."
+    history = history + [{"role": "assistant", "content": position}]
+    critic_panel = _format_critic_panel(result)
+    claims_md = _format_claims_table(result.get("claim_confidences") or [])
+    try:
+        session_md = _format_session_sidebar(session_id)
+    except Exception:
+        # The answer is already computed; a sidebar failure must not discard it.
+        logger.exception("Could not load session %s", session_id)
+        session_md = f"### Session `{session_id[:8]}...`\n\n*Session history unavailable.*"
+    export_md = result.get("export_md") or ""
+    yield history, session_id, critic_panel, claims_md, session_md, export_md, None
+def export_md_file(export_md_content, session_id):
+    """Write the Markdown export to a file in the temp directory and return its path.
+
+    The file is named recon_session_<first 8 characters of session_id>.md.
+    Returns None when the content is missing or blank, or when the file
+    cannot be written (the failure is logged with its traceback).
+    """
+    if not (export_md_content or "").strip():
+        return None
+    tag = (session_id or "")[:8]
+    path = os.path.join(tempfile.gettempdir(), f"recon_session_{tag}.md")
+    # Only the write can fail here, so only the write is inside the try.
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            f.write(export_md_content)
+    except (OSError, UnicodeError) as e:
+        logger.exception("Export failed: %s", e)
+        return None
+    return path
+def new_session():
+    """Return a fresh uuid4 session id followed by reset values for the UI panels.
+
+    The 7-tuple is: (session id, empty chat history, critic placeholder,
+    claims placeholder, session placeholder, empty export text, None).
+    """
+    fresh_id = str(uuid.uuid4())
+    placeholders = (
+        "*Critic verdict will appear here.*",
+        "*Run a query to see claims.*",
+        "*Session history will appear here.*",
+    )
+    return (fresh_id, [], *placeholders, "", None)
+# ---------------------------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------------------------
+with gr.Blocks(title="RECON — Multi-Agent Research Navigator") as demo:
+    gr.Markdown("""
+# 🔍 RECON — Multi-Agent Research Navigator
+**Temporally-aware ML literature research with staleness detection and contradiction flagging.**
+Enter a research query about any ML topic. RECON retrieves live papers, evaluates evidence quality, and synthesizes a cited research position.
+""")
+    # Start empty. A uuid computed here would be evaluated once, when the UI is
+    # built, and handed to every visitor, so all of them would share one
+    # session id. The per-visitor id is assigned on page load by start_session.
+    session_id_state = gr.State("")
+    export_md_state = gr.State("")
+    with gr.Row():
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(
+                label="Research Position",
+                height=500,
+            )
+            with gr.Row():
+                query_input = gr.Textbox(
+                    placeholder="e.g. What is the current state of KV cache compression in LLMs?",
+                    label="Research Query",
+                    scale=4,
+                    lines=2,
+                )
+                submit_btn = gr.Button("🔍 Research", variant="primary", scale=1)
+            with gr.Row():
+                decay_dropdown = gr.Dropdown(
+                    choices=["linear", "log", "none"],
+                    value="linear",
+                    label="Recency decay",
+                    scale=1,
+                )
+                new_session_btn = gr.Button("🔄 New Session", scale=1)
+                session_display = gr.Textbox(
+                    label="Session ID",
+                    interactive=False,
+                    scale=2,
+                )
+            claims_output = gr.Markdown(
+                value="*Run a query to see claim confidence scores.*"
+            )
+        with gr.Column(scale=2):
+            critic_output = gr.Markdown(
+                value="*Critic verdict will appear here after each query.*"
+            )
+            gr.Markdown("---")
+            session_output = gr.Markdown(
+                value="*Session history will appear here.*"
+            )
+            gr.Markdown("---")
+            export_btn = gr.Button("📥 Export Session (.md)", variant="secondary")
+            export_file = gr.File(label="Download")
+    # ---------------------------------------------------------------------------
+    # Event handlers
+    # ---------------------------------------------------------------------------
+    def start_session():
+        """Create a session id for this page load; returned twice (state, display box)."""
+        sid = str(uuid.uuid4())
+        return sid, sid
+    def on_submit(query, session_id, decay_config, history):
+        """Relay run_query updates, putting the session id in the last slot for display."""
+        for chat, sid, critic, claims, session, export_md, _ in run_query(query, session_id, decay_config, history):
+            yield chat, sid, critic, claims, session, export_md, sid
+    def on_new_session():
+        """Reset the panels via new_session and mirror the new id into the Session ID box."""
+        reset = new_session()
+        return (*reset, reset[0])
+    demo.load(fn=start_session, outputs=[session_id_state, session_display])
+    query_inputs = [query_input, session_id_state, decay_dropdown, chatbot]
+    query_outputs = [chatbot, session_id_state, critic_output, claims_output,
+                     session_output, export_md_state, session_display]
+    # The Research button and submitting the textbox run the same handler.
+    for register in (submit_btn.click, query_input.submit):
+        register(fn=on_submit, inputs=query_inputs, outputs=query_outputs)
+    new_session_btn.click(
+        fn=on_new_session,
+        outputs=[session_id_state, chatbot, critic_output,
+                 claims_output, session_output, export_md_state, export_file,
+                 session_display],
+    )
+    export_btn.click(
+        fn=export_md_file,
+        inputs=[export_md_state, session_id_state],
+        outputs=[export_file],
+    )
+if __name__ == "__main__":
+    # Listen on all interfaces, port 7860; no public share link is requested.
+    launch_options = {"server_name": "0.0.0.0", "server_port": 7860, "share": False}
+    demo.launch(**launch_options)

data/cache/7dbd31b978cf26642382f8e8fbf51d4c.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"url": "https://arxiv.org/html/2508.06297v1", "snippet": "KVcachecompressionis a key technology for optimizing the inference efficiency ofLLMs, primarily by compressing the key and value tensors in the self-attention mechanism to reduce memory usage and improve computational efficiency.", "title": "KV Cache Compression for Inference Efficiency in LLMs: A Review - arXiv.org", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://www.semanticscholar.org/paper/KV-Cache-Compression-for-Inference-Efficiency-in-A-Liu-Fu/1a3bb1d6f9ce653baa68311df6481ed51535d99b", "snippet": "This review systematically examines currentKVcacheoptimization techniques, includingcompressionstrategies such as selective token strategies, quantization, and attentioncompression, providing a comprehensive analysis of their impact on memory usage and inference speed. With the rapid advancement of large language models (LLMs), the context length for inference continuously increases ...", "title": "KV Cache Compression for Inference Efficiency in LLMs: A Review", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://arxiv.org/html/2502.01941v1", "snippet": "Abstract Thispaperinvestigates an under-explored challenge in large language models (LLMs): the impact ofKVcachecompressionmethodsonLLMs'fundamental capabilities. While existingmethodsachieve impressivecompressionratios on long-context benchmarks, their effects on core model capabilities remain understudied.", "title": "Can LLMs Maintain Fundamental Abilities under KV Cache Compression?", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}]

data/cache/af237adc7793290b2c582b6d100fead6.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"title": "Inferix: A Block-Diffusion based Next-Generation Inference Engine for World Simulation", "abstract": "World models serve as core simulators for fields such as agentic AI, embodied AI, and gaming, capable of generating long, physically realistic, and interactive high-quality videos. Moreover, scaling these models could unlock emergent capabilities in visual perception, understanding, and reasoning, paving the way for a new paradigm that moves beyond current LLM-centric vision foundation models. A key breakthrough empowering them is the semi-autoregressive (block-diffusion) decoding paradigm, which merges the strengths of diffusion and autoregressive methods by generating video tokens in block-applying diffusion within each block while conditioning on previous ones, resulting in more coherent and stable video sequences. Crucially, it overcomes limitations of standard video diffusion by reintroducing LLM-style KV Cache management, enabling efficient, variable-length, and high-quality generation. Therefore, Inferix is specifically designed as a next-generation inference engine to enable immersive world synthesis through optimized semi-autoregressive decoding processes. This dedicated focus on world simulation distinctly sets it apart from systems engineered for high-concurrency scenarios (like vLLM or SGLang) and from classic video diffusion models (such as xDiTs). Inferix further enhances its offering with interactive video streaming and profiling, enabling real-time interaction and realistic simulation to accurately model world dynamics. Additionally, it supports efficient benchmarking through seamless integration of LV-Bench, a new fine-grained evaluation benchmark tailored for minute-long video generation scenarios. 
We hope the community will work together to advance Inferix and foster world model exploration.", "year": 2025, "citation_count": 3, "paper_id": "97f44b3e6333afffadaade175b4132d92fe6b297", "authors": ["Tianyu Feng", "Yizeng Han", "Jiahao He", "Yuanyu He", "Xi Lin", "Teng Liu", "Hanfeng Lu", "Jiasheng Tang", "Wei Wang", "Zhiyuan Wang", "Jichao Wu", "Mingyan Yang", "Yinghao Yu", "Zeyu Zhang", "Bohan Zhuang"], "references": ["5d4088ab2efc4a4951fda5100b9329ae485ff3d9", "40b45da7ce1beeac6b251dbd3fe6557f6a5b4533", "94e338cd656e602300632cae6365aad244fc6010", "a6939d2e5d831c1c6b249670d2e655ae4ee22182", "a8e2e3ff1770fd83228659e9e4d16114ddb9404b", "a1c6ac272065719a74050ba79b52020687b4debc", "3400ad801cba5b356e15d346c6eb88ad95b7253f", "77bfbd9259af282c8b1c5e59ed363fd03958a5cd", "8877e9a5bf473404c35b758faa76905c202f0f36", "6f1f0a76d30bf2a55397c1b29da682c443fa88b5", "296e70f6b24b3eb5747b933dabaabd361db00c69", "e6068903ecdf6ecdd3c17f3bde38e3d1dd5ffebd", "ccc2b6b8b5c0b8d33433dfbfe5ae8b8045b5173a", "047f0dcc0c73edc222be828136e50c335137545f", "e4ace471c7675b107644486b933f5dfe621f5e79", "c738c5630f454d1722f936cdaeec053744695c96", "e4275fdad6226f79449774b388921200d095bdd5", "f2d0f3d47ae850f49a58f4977393bd0025af4bec", "e10ba28cd6dab4a292acdab0a0733a89de1ba640", "964440957c030504f6bcab11f514635ece1bf6b2", "dedcfd974eb7a8ab3d02b42561e4448883f0215a", "cc6fc3c546b354abf6a0aa3b553f28a6b812489f", "db1cd3514fa2baa0e780fb51ecadf4b37da9caac", "a3e000e0d7f64c1d094c2a8bf6f43992cbabe91b", "4e9a8141da2a8c603722b07d096109207f8e0b66", "f68f6f2a057c4e6e5a3c91fc8563533d9bf6e560", "02ad9f3fefe33cb9ca546591bec65dbdf7766c80", "83b90f4a0ae4cc214eb3cc140ccfef9cd99fac05", "42a14d824caa3348046eb34c37e2ab7985faa7a3", "736973165f98105fec3729b7db414ae4d80fcbeb", "8a349ff8222986274f302bf85c1f53a7ffafbf54", "e1a3e6856b6ac6af3600b5954392e5368603fd1b", "9eb3584dc1193ea9192be8df6a3b57aebd3b8548", "673d3ca3daa4eee72a1ee64d0680e66f869716af", "b976a5738538ea8aed7396628fc0a3ad17b30787", 
"5b6fdb2aea424d02c141da81d62b04d739d62b96", "38b99dbe4f49c6b265abb8f2b703bd53e1b53459"], "hybrid_score": 0.4464, "source": "semantic_scholar"}]

data/cache/cc860319625f8f1b0658f48c26ada610.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"url": "https://arxiv.org/html/2508.06297v1", "snippet": "August 8, 2025 -We focus on identifying the limitations and challenges of existing methods, such ascompatibility issues with different models and tasks. Additionally, this review highlights future research directions, including hybrid optimization techniques, adaptive dynamic strategies, and software-hardware ...", "title": "KV Cache Compression for Inference Efficiency in LLMs: A Review", "inferred_year": 2025, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://arxiv.org/html/2502.01941v2", "snippet": "May 21, 2025 -This paper investigates an underexplored challenge in large language models (LLMs): the impact of KV cache compression methods on LLMs\u2019 fundamental capabilities. Although existing methods achieve impressive compression ratios on long-context benchmarks, their effects on core model capabilities remain understudied.", "title": "Can LLMs Maintain Fundamental Abilities under KV Cache Compression?", "inferred_year": 2025, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://arxiv.org/abs/2502.01941", "snippet": "May 21, 2025 -This paper investigates an underexplored challenge in large language models (LLMs):the impact of KV cache compression methods on LLMs' fundamental capabilities. Although existing methods achieve impressive compression ratios on long-context ...", "title": "[2502.01941] Can LLMs Maintain Fundamental Abilities under KV Cache Compression?", "inferred_year": 2025, "hybrid_score": 0.0, "source": "duckduckgo"}]

data/cache/e5c4fcb986d222d9f17c613ce1b46edb.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"title": "KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache", "abstract": "Efficiently serving large language models (LLMs) requires batching of many requests to reduce the cost per request. Yet, with larger batch sizes and longer context lengths, the key-value (KV) cache, which stores attention keys and values to avoid re-computations, significantly increases memory demands and becomes the new bottleneck in speed and memory usage. Additionally, the loading of the KV cache causes the computational core to be idle, which limits the inference speed. A straightforward and effective solution to reduce KV cache size is quantization, which decreases the total bytes taken by KV cache. However, there is a lack of in-depth studies that explore the element distribution of KV cache to understand the hardness and limitation of KV cache quantization. To fill the gap, we conducted a comprehensive study on the element distribution in KV cache of popular LLMs. Our findings indicate that the key cache should be quantized per-channel, i.e., group elements along the channel dimension and quantize them together. In contrast, the value cache should be quantized per-token. From this analysis, we developed a tuning-free 2bit KV cache quantization algorithm named KIVI. With hardware-friendly implementation, KIVI can enable Llama, Falcon, and Mistral models to maintain almost the same quality while using $\\mathbf{2.6\\times}$ less peak memory (including model weight). This reduction in memory usage enables up to $\\mathbf{4\\times}$ larger batch size, bringing $\\mathbf{2.35\\times \\sim 3.47\\times}$ throughput on real LLM inference workload. 
The source code is available at https://github.com/jy-yuan/KIVI.", "year": 2024, "citation_count": 392, "paper_id": "a3e000e0d7f64c1d094c2a8bf6f43992cbabe91b", "authors": ["Zirui Liu", "Jiayi Yuan", "Hongye Jin", "Shaochen Zhong", "Zhaozhuo Xu", "Vladimir Braverman", "Beidi Chen", "Xia Hu"], "references": ["fbfe920579cc1c13358521d403cfce31f2afbead", "db52731a663fae1582eecdd16e88db213f8b2a74", "c811bedbe8f4c21d0cba9f9175f7c2eb203284a7", "b085968c4362fb286ad6c5ef71a5db9630da0498", "9529e50807f36acf3d2e4af994b5803c47e4746a", "db633c6b1c286c0386f0078d8a2e6224e03a6227", "fdc53c2c10742464087c0525f77e32604827a21d", "83b90f4a0ae4cc214eb3cc140ccfef9cd99fac05", "b31a5884a8ebe96b6300839b28608b97f8f8ef76", "104b0bb1da562d53cbda87aec79ef6a2827d191a", "3b7ef6f9f27e33e6a4e3bfac90dcb01ab09718bc", "0423fc7bc1880b850d07aec8ebd9217a70626572", "7a1e71cb1310c4a873e7a4e54d1a6dab0553adce", "d6eeb2898bd9bd34744194ef543062dda6c4531a", "50eb97f832ffcd2114f79957c977215176384e3d", "60b35c6d68acced19b0c66edcfc0ee0a2c11efed", "5ae6fb6b5a3c7df515ff4a82ac9673bae6a8e200", "5f187af087ebbaf1ce4bca686a4b1c2afee92b6d", "15dd43ded15e6dbf750278430bd822ee2d1b977f", "42a14d824caa3348046eb34c37e2ab7985faa7a3", "57e849d0de13ed5f91d086936296721d4ff75a75", "2c994fadbb84fb960d8306ee138dbeef41a5b323", "7d645a3fd276918374fd9483fd675c28e46506d1", "7da0f2501034522e3d50af7e9b8fa7ec9d7b65b6", "4be7d1524edb0137599a5cc95f72844b85a52fe1", "90abbc2cf38462b954ae1b772fac9532e2ccd8b0", "dc52b09089704ebd6f471177474bc29741c50023", "2b7c9fd2a94deaee3e7e56dc57bab0bd39d3683c", "b378e54c88d241aa917131beb65c96be3730f40c"], "hybrid_score": 0.6509, "source": "semantic_scholar"}, {"title": "KVQuant: Towards 10 Million Context Length LLM Inference with KV Cache Quantization", "abstract": "LLMs are seeing growing use for applications which require large context windows, and with these large context windows KV cache activations surface as the dominant contributor to memory consumption during inference. 
Quantization is a promising approach for compressing KV cache activations; however, existing solutions fail to represent activations accurately in sub-4-bit precision. Our work, KVQuant, facilitates low precision KV cache quantization by incorporating several novel methods: (i) Per-Channel Key Quantization, where we adjust the dimension along which we quantize the Key activations to better match the distribution; (ii) Pre-RoPE Key Quantization, where we quantize Key activations before the rotary positional embedding to mitigate its impact on quantization; (iii) Non-Uniform KV Cache Quantization, where we derive per-layer sensitivity-weighted non-uniform datatypes that better represent the distributions; and (iv) Per-Vector Dense-and-Sparse Quantization, where we isolate outliers separately for each vector to minimize skews in quantization ranges. By applying our method to the LLaMA, Llama-2, Llama-3, and Mistral models, we achieve<0.1 perplexity degradation with 3-bit quantization on both Wikitext-2 and C4, outperforming existing approaches. Our method enables serving LLaMA-7B with a context length of up to 1 million on a single A100-80GB GPU and up to 10 million on an 8-GPU system. We develop custom CUDA kernels for KVQuant, showing that we can achieve up to ~1.7x speedups, compared to baseline fp16 matrix-vector multiplications, for the LLaMA-7B model.", "year": 2024, "citation_count": 434, "paper_id": "b085968c4362fb286ad6c5ef71a5db9630da0498", "authors": ["Coleman Hooper", "Sehoon Kim", "Hiva Mohammadzadeh", "Michael W. Mahoney", "Y. Shao", "Kurt Keutzer", "A. 
Gholami"], "references": ["ac5824e9ff924a937d9eef379d0b581de2417678", "4c14b1c41cb0aaa68f5d3f4a432f55e7199657ea", "713806165610c237f551a7b68e6b09b3ded75502", "9529e50807f36acf3d2e4af994b5803c47e4746a", "db633c6b1c286c0386f0078d8a2e6224e03a6227", "6c323c535365e1c7cbfd9703cbec3b5650a3346b", "fdc53c2c10742464087c0525f77e32604827a21d", "b6346f9fa093b8e85df712485a2b851b9f680dac", "73290ecbec2f38d1d647ddef1ada69cee41725b3", "464cf829eaaeb2b3bafc84cc9203790e95102049", "b31a5884a8ebe96b6300839b28608b97f8f8ef76", "eb2c2330177f765038a2b17e2ee3498965865797", "104b0bb1da562d53cbda87aec79ef6a2827d191a", "f5afaccfe90268485a9961c5771ec5e71e9b806c", "3b7ef6f9f27e33e6a4e3bfac90dcb01ab09718bc", "51db4c39dc0bdf5c95c8bbe89bf4211b48d0b4df", "6bd3ee1ca608bc66a490f63f2fb107d79b44f3e2", "d6eeb2898bd9bd34744194ef543062dda6c4531a", "60b35c6d68acced19b0c66edcfc0ee0a2c11efed", "32ac52069e562d4f900afee70bdca63f53461481", "42a14d824caa3348046eb34c37e2ab7985faa7a3", "0a6906bd6f026d3da3031c641ed03081bd0b574e", "57e849d0de13ed5f91d086936296721d4ff75a75", "2c994fadbb84fb960d8306ee138dbeef41a5b323", "3f6243097a58e386aea1215fed4f372dee07a100", "4be7d1524edb0137599a5cc95f72844b85a52fe1", "73bcf4577284fa116ee73487b7cbb85c8266eaa0", "66c10bf1f11bc1b2d92204d8f8391d087f6de1c4", "814a4f680b9ba6baba23b93499f4b48af1a27678", "6c4b76232bb72897685d19b3d264c6ee3005bc2b", "1a858b96d2fdfeadf8c0f7126cbd55825223fb9d", "851e437dc09b4abd99bee73ade84b85437d61c4b", "e837dfa120e8ce3cd587bde7b0787ef43fa7832d", "e41c96b0d36c89f6dd1f21ccb80c7cc19af6a1cf", "efbd381493bb9636f489b965a2034d529cd56bcd", "e3aa232577bb427b1f3a34acbdef84bd85734042", "2b7c9fd2a94deaee3e7e56dc57bab0bd39d3683c", "81051b830a4f5606106765902a51ba281c9230f9", "9f0fe125af3cfbad99f1f2a6ada0daf61eef92b1"], "hybrid_score": 0.6081, "source": "semantic_scholar"}, {"title": "Fast-dLLM: Training-free Acceleration of Diffusion LLM by Enabling KV Cache and Parallel Decoding", "abstract": "Diffusion-based large language models (Diffusion LLMs) have shown promise for 
non-autoregressive text generation with parallel decoding capabilities. However, the practical inference speed of open-sourced Diffusion LLMs often lags behind autoregressive models due to the lack of Key-Value (KV) Cache and quality degradation when decoding multiple tokens simultaneously. To bridge this gap, we introduce a novel block-wise approximate KV Cache mechanism tailored for bidirectional diffusion models, enabling cache reuse with negligible performance drop. Additionally, we identify the root cause of generation quality degradation in parallel decoding as the disruption of token dependencies under the conditional independence assumption. To address this, we propose a confidence-aware parallel decoding strategy that selectively decodes tokens exceeding a confidence threshold, mitigating dependency violations and maintaining generation quality. Experimental results on LLaDA and Dream models across multiple LLM benchmarks demonstrate up to \\textbf{27.6$\\times$ throughput} improvement with minimal accuracy loss, closing the performance gap with autoregressive models and paving the way for practical deployment of Diffusion LLMs.", "year": 2025, "citation_count": 208, "paper_id": "5e9ffdd179df49be1129e32ae75fc89a2b68e676", "authors": ["Chengyue Wu", "Hao Zhang", "Shuchen Xue", "Zhijian Liu", "Shizhe Diao", "Ligeng Zhu", "Ping Luo", "Song Han", "Enze Xie"], "references": ["4a230ff814e6492da8c7dbd1b3d44842df1be989", "c6f896aa698b2d65160372bce057ea5f081904de", "83e3bb785cc56093b9cdac84e9bcacdb0f092d44", "7ffe883cf9ccfc8f212231d4f4b137d5096013e9", "1746d7b2c5c9a0351d51acbc636ac9c4b0eefd9a", "0d11a9674b68216b92e08cf7617a93fbd3fb91f4", "c3f18504c770549771b29c222db41b343dd1f36d", "b5c4c2f88fac2ca41854bbe8ae9740b9b03a0d3a", "385f0c77e72f95e7df74e98b4ff6b9ab19d52f93", "6a33c42eb058a04f3ae4ea08277d364522d0b0b8", "e85213146c7f50474c72116f33d36196c1c2e857", "a9ac6c4b165d8c4bf1a949d0931e7ca7a21fc55b", "f8d357d38bbcdd93889fe71762eb57842b2ab063", 
"7a73cc4bc0ce80661b5acbb51bbde30ea5167280", "1bed8c7541381b1f79027c240b64c9276573fc3c", "e8430dcca698b9f012615157c2a81b4983a10038", "9cbbb250a565228ba328038ee7944b89cff53e84", "a0a79dad89857a96f8f71b14238e5237cbfc4787", "3a22aad6c18a9559be3bbb197494b434b872a05a", "6d1433f3342fbee85ad1e2809e62734aec5c3853", "33433e9103b00aa0c42597cbfe13a429fbf5abdf", "a979742220a88b1d32e1fbe72c41e8ba3007053c", "3d8a2753649f3c493e2c237b0f4049858e958ae6", "e9b9a47cd81c66603c827f0f2bc4fba0d9ae77c4", "91b32fc0a23f0af53229fceaae9cce43a0406d2e", "2cd605106b88c85d7d8b865b1ef0f8c8293debf1", "add5f3f820b393e7ce5ed467814253824ecc484b", "395de0bd3837fdf4b4b5e5f04835bcc69c279481", "204e3073870fae3d05bcbc2f6a8e263d9b72e776", "2dcef55a07f8607a819c21fe84131ea269cc2e3c", "cfa3cb6b21410352a9aacb991cb0bd7e7d6a4ec0", "13bd881530d003403325cd2de57d8af42acd0d81"], "hybrid_score": 0.5691, "source": "semantic_scholar"}, {"title": "X-EcoMLA: Upcycling Pre-Trained Attention into MLA for Efficient and Extreme KV Compression", "abstract": "Multi-head latent attention (MLA) is designed to optimize KV cache memory through low-rank key-value joint compression. Rather than caching keys and values separately, MLA stores their compressed latent representations, reducing memory overhead while maintaining the performance. While MLA improves memory efficiency without compromising language model accuracy, its major limitation lies in its integration during the pre-training phase, requiring models to be trained from scratch. This raises a key question: can we use MLA's benefits fully or partially in models that have already been pre-trained with different attention mechanisms? In this paper, we propose X-EcoMLA to deploy post training distillation to enable the upcycling of Transformer-based attention into an efficient hybrid MLA variant through lightweight post-training adaptation, bypassing the need for extensive pre-training. 
We demonstrate that leveraging the dark knowledge of a well-trained model can enhance training accuracy and enable extreme KV cache compression in MLA without compromising model performance. The experimental results show that our proposed method can effectively compress the KV cache while preserving the performance on the benchmarks; specifically, for Llama3.2-1B-Instruct baseline, a 6.4x compression achieves the same average score by using only 3.6B training tokens and 70 GPU hours on AMD MI300, whereas a 10.6x compression have less than 0.1% average score drop with 7B training tokens and 140 GPU hours. The code for this work is available at https://github.com/AMD-AGI/AMD-Hybrid-Models.", "year": 2025, "citation_count": 2, "paper_id": "7aa4f9d94b6c81b8778cdfa7d1d6200f5edd4028", "authors": ["Guihong Li", "Mehdi Rezagholizadeh", "Mingyu Yang", "Vikram V. Appia", "E. Barsoum"], "references": ["4cbf0b9fd18a1850ce588244b073927c372a0d4f", "2eed1fad9bbf887d4395de40f20144c4fafefd7f", "d4fb143e6adbc86e0b200d1d131908db1ff24770", "230762f388c4b6f0e8af4554e2df5fd4248b522d", "bab5d963ba2d59fd74cc22f36bf14924025f1b5a", "40e8af970329135ec95057d73e239dab805ad128", "f06f347b28656a5453af8239a705b02a13c8e28a", "6c67e17fdd763f212e582f8de56b56dd5ed58832", "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2", "022f386eb66fc5532dd6f439e7a356fd33ebb9a2", "dc3b69ed88f62d1a95c25f8cbd01d144b00312f2", "ca9f5b3bf0f54ad97513e6175b30497873670fed", "1d4c48335d841014d0145256c3c4e7f6c426b8fb", "53a803388e83ae89261624099d7be4287ace67cb", "05c1dc502ed51162580ccd320d5668d2fec94a7a", "7a54aad06171f59149aca5380863c62729c70b41", "1759d78e00b811b2b4b35b49e22f7ec11694f5ad", "f4a0c4154203808f362e4678f3741b3d317fdc82", "7bbc7595196a0606a07506c4fb1473e5e87f6082", "fdc53c2c10742464087c0525f77e32604827a21d", "2a38daf98d506477f8180806f503409d5036eaf4", "e586a4591ba0303b769f2c07cbddaf1899cb72e4", "5ae6fb6b5a3c7df515ff4a82ac9673bae6a8e200", "397e0e0d20f00d8fbfecd2fd36b14f13e2181d0e", 
"814a4f680b9ba6baba23b93499f4b48af1a27678", "90abbc2cf38462b954ae1b772fac9532e2ccd8b0", "925ad2897d1b5decbea320d07e99afa9110e09b2", "04f4e55e14150b7c48b0287ba77c7443df76ed45", "dc52b09089704ebd6f471177474bc29741c50023", "0c3c4c88c7b07596221ac640c7b7102686e3eae3", "8b0f27bb594b1eaaf493eaf1e2ee723a2b0a19ad", "1536e8958697c5364f68b2e2448905dbbeb3a0ca", "88bb0a28bb58d847183ec505dda89b63771bb495", "636a79420d838eabe4af7fb25d6437de45ab64e8", "5b6fdb2aea424d02c141da81d62b04d739d62b96", "92e121c6e114fe3cfb89370df03847c66a9b4e28"], "hybrid_score": 0.527, "source": "semantic_scholar"}]

data/cache/ee737556fd55e47471ca6ec835d5b80f.json ADDED Viewed

	@@ -0,0 +1 @@

+ [{"url": "https://arxiv.org/html/2410.00161v2", "snippet": "We propose query-group-compression, a simple yet effective method tocompresstheKVcacheof GQA models without repeating it into the dimension of ...", "title": "KV-Compress: Paged KV-Cache Compression with Variable", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://arxiv.org/html/2502.01941v2", "snippet": "We hope our work can provide theresearchcommunity with insightful perspectives on the impact ofKVcachecompressionon LLMs.", "title": "Can LLMs Maintain Fundamental Abilities under KV Cache", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}, {"url": "https://arxiv.org/html/2603.21576v2", "snippet": "Long-context LLM inference is bottlenecked not by compute but by the memory bandwidth required to scan theKVcacheat every decode step\u2014a cost ...", "title": "PRISM: Photonic Similarity Engine for KV Cache Block Selection", "inferred_year": null, "hybrid_score": 0.0, "source": "duckduckgo"}]