[ { "id": "A001", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What is the current consensus on the primary bottleneck of KV cache in LLM inference?", "source_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models", "source_year": 2024, "source_id": "d372fb69c485472385f152bc832bf1d35e223324" }, { "id": "A002", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What does the research consensus say about KV cache similarity patterns across transformer layers?", "source_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models", "source_year": 2024, "source_id": "d372fb69c485472385f152bc832bf1d35e223324" }, { "id": "A003", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What is the established approach for KV cache compression and streaming for long-context LLM serving?", "source_paper": "CacheGen: KV Cache Compression and Streaming for Fast Large Language Model Serving", "source_year": 2023, "source_id": "40e565e070fde823097507fd6830cfa6944df95d" }, { "id": "A004", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What is the current consensus on how speculative decoding reduces LLM inference latency?", "source_paper": "Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding", "source_year": 2024, "source_id": "0cee098244c9978032702862a43a09f468f691a4" }, { "id": "A005", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What does the research consensus say about the trade-off between draft model quality and speculative decoding speedup?", "source_paper": "Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding", "source_year": 2024, "source_id": "0cee098244c9978032702862a43a09f468f691a4" }, { "id": 
"A006", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What is the established understanding of self-speculative decoding methods for LLM inference acceleration?", "source_paper": "SWIFT: On-the-Fly Self-Speculative Decoding for LLM Inference Acceleration", "source_year": 2024, "source_id": "bacdf9671fb872287201b53d768df89b4d6630a3" }, { "id": "A007", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_merging", "question": "What is the current consensus on adaptive KV cache merging for long-context LLM tasks?", "source_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks", "source_year": 2024, "source_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2" }, { "id": "A008", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What is the established view on whether KV cache compression degrades reasoning performance in LLMs?", "source_paper": "Hold Onto That Thought: Assessing KV Cache Compression On Reasoning", "source_year": 2025, "source_id": "baca578c4a3dcec8a94d6d045970b5f8cb6ebbac" }, { "id": "A009", "category": "A", "subfield": "llm_efficiency", "topic": "quantization", "question": "What does research consensus say about the accuracy-compression tradeoff in LLM quantization?", "source_paper": "GPTVQ: The Blessing of Dimensionality for LLM Quantization", "source_year": 2024, "source_id": "gptvq_paper_id" }, { "id": "A010", "category": "A", "subfield": "llm_efficiency", "topic": "quantization", "question": "What is the current consensus on whether compressed LLMs can perform agentic tasks effectively?", "source_paper": "Can Compressed LLMs Truly Act? 
An Empirical Evaluation of Agentic Capabilities", "source_year": 2025, "source_id": "compressed_llm_agent_id" }, { "id": "A011", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What is the established understanding of token importance criteria for KV cache eviction policies?", "source_paper": "LagKV: Lag-Relative Information of the KV Cache Tells Which Tokens Are Important", "source_year": 2025, "source_id": "50a2c39150d4faca53f4990ddf3d479ff570be23" }, { "id": "A012", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What does the literature agree on regarding KV cache compression limitations for autoregressive transformers?", "source_paper": "Compression Barriers for Autoregressive Transformers", "source_year": 2025, "source_id": "compression_barriers_id" }, { "id": "A013", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What is the consensus on multi-draft speculative decoding versus single-draft methods?", "source_paper": "Towards Optimal Multi-draft Speculative Decoding", "source_year": 2025, "source_id": "multidraft_spec_id" }, { "id": "A014", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What is the established view on using self-verification to improve speculative decoding acceptance rates?", "source_paper": "Draft Model Knows When to Stop: Self-Verification Speculative Decoding", "source_year": 2024, "source_id": "selfverify_spec_id" }, { "id": "A015", "category": "A", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "What is the current consensus on dynamic token sparsification for KV cache-related memory optimization in vision-language models?", "source_paper": "ZipVL: Efficient Large Vision-Language Models with Dynamic Token Sparsification and KV Cache Compression", "source_year": 2024, "source_id": "7bf7b9d24eaef1d30f77cda4f4489e36a8329ee9" }, { 
"id": "A016", "category": "A", "subfield": "training_methods", "topic": "rlhf", "question": "What is the current consensus on the core mechanism and purpose of RLHF for LLM alignment?", "source_paper": "A Survey of Reinforcement Learning from Human Feedback", "source_year": 2023, "source_id": "rlhf_survey_2023_id" }, { "id": "A017", "category": "A", "subfield": "training_methods", "topic": "rlhf", "question": "What does the research consensus say about the role of reward models in RLHF pipelines?", "source_paper": "A Survey of Reinforcement Learning from Human Feedback", "source_year": 2023, "source_id": "rlhf_survey_2023_id" }, { "id": "A018", "category": "A", "subfield": "training_methods", "topic": "rlhf", "question": "What is the established understanding of length bias as a failure mode in RLHF reward modeling?", "source_paper": "Bias Fitting to Mitigate Length Bias of Reward Model in RLHF", "source_year": 2025, "source_id": "length_bias_rlhf_id" }, { "id": "A019", "category": "A", "subfield": "training_methods", "topic": "lora", "question": "What is the consensus on the fundamental mechanism of LoRA for parameter-efficient fine-tuning?", "source_paper": "LoRA: Low-Rank Adaptation of Large Language Models", "source_year": 2021, "source_id": "lora_original_id" }, { "id": "A020", "category": "A", "subfield": "training_methods", "topic": "lora", "question": "What does the survey literature say about the current state of LoRA variants for large language models?", "source_paper": "A survey on LoRA of large language models", "source_year": 2024, "source_id": "lora_survey_2024_id" }, { "id": "A021", "category": "A", "subfield": "training_methods", "topic": "lora", "question": "What is the established view on the memory efficiency benefits of LoRA compared to full fine-tuning?", "source_paper": "LoRA: Low-Rank Adaptation of Large Language Models", "source_year": 2021, "source_id": "lora_original_id" }, { "id": "A022", "category": "A", "subfield": "training_methods", 
"topic": "dpo", "question": "What is the current consensus on how DPO differs from PPO in the RLHF alignment pipeline?", "source_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "source_year": 2023, "source_id": "dpo_original_id" }, { "id": "A023", "category": "A", "subfield": "training_methods", "topic": "dpo", "question": "What does research consensus say about the training stability advantages of DPO over PPO?", "source_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "source_year": 2023, "source_id": "dpo_original_id" }, { "id": "A024", "category": "A", "subfield": "training_methods", "topic": "qlora", "question": "What is the established consensus on QLoRA's approach to efficient fine-tuning of quantized LLMs?", "source_paper": "QLoRA: Efficient Finetuning of Quantized LLMs", "source_year": 2023, "source_id": "qlora_original_id" }, { "id": "A025", "category": "A", "subfield": "training_methods", "topic": "qlora", "question": "What does the literature say about QLoRA's ability to match full fine-tuning performance at reduced memory cost?", "source_paper": "QLoRA: Efficient Finetuning of Quantized LLMs", "source_year": 2023, "source_id": "qlora_original_id" }, { "id": "A026", "category": "A", "subfield": "training_methods", "topic": "loftq", "question": "What is the established understanding of LoftQ's approach to aligning quantization with LoRA fine-tuning?", "source_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models", "source_year": 2023, "source_id": "loftq_id" }, { "id": "A027", "category": "A", "subfield": "training_methods", "topic": "rlhf", "question": "What is the current consensus on how direct alignment methods unify preference optimization theory?", "source_paper": "From RLHF to Direct Alignment: A Theoretical Unification of Preference Learning", "source_year": 2026, "source_id": "rlhf_unification_id" }, { "id": "A028", "category": "A", 
"subfield": "training_methods", "topic": "dpo", "question": "What is the established understanding of gradient imbalance as a limitation in DPO training?", "source_paper": "Gradient Imbalance in Direct Preference Optimization", "source_year": 2025, "source_id": "gradient_imbalance_dpo_id" }, { "id": "A029", "category": "A", "subfield": "training_methods", "topic": "lora", "question": "What does the literature agree on regarding the trade-off between LoRA rank and fine-tuning performance?", "source_paper": "A survey on LoRA of large language models", "source_year": 2024, "source_id": "lora_survey_2024_id" }, { "id": "A030", "category": "A", "subfield": "training_methods", "topic": "rlhf", "question": "What is the consensus on reward hacking as a fundamental challenge in RLHF systems?", "source_paper": "Adversarial Preference Learning for Robust LLM Alignment", "source_year": 2025, "source_id": "adversarial_pref_id" }, { "id": "A031", "category": "A", "subfield": "rag", "topic": "rag_overview", "question": "What is the current consensus on the core benefit of RAG over purely parametric LLMs for knowledge-intensive tasks?", "source_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "source_year": 2020, "source_id": "659bf9ce7175e1ec266ff54359e2bd76e0b7ff31" }, { "id": "A032", "category": "A", "subfield": "rag", "topic": "rag_evaluation", "question": "What does the survey literature say about the current state of RAG evaluation methodologies?", "source_paper": "Evaluation of Retrieval-Augmented Generation: A Survey", "source_year": 2024, "source_id": "rag_eval_survey_id" }, { "id": "A033", "category": "A", "subfield": "rag", "topic": "graph_rag", "question": "What is the established consensus on how graph-based RAG improves over flat vector retrieval?", "source_paper": "Graph Retrieval-Augmented Generation: A Survey", "source_year": 2024, "source_id": "graph_rag_survey_id" }, { "id": "A034", "category": "A", "subfield": "rag", "topic": 
"rag_overview", "question": "What is the current consensus on the main failure modes of standard RAG pipelines?", "source_paper": "Retrieval-Augmented Generation for AI-Generated Content: A Survey", "source_year": 2024, "source_id": "rag_aigc_survey_id" }, { "id": "A035", "category": "A", "subfield": "rag", "topic": "dense_retrieval", "question": "What does research consensus say about the effectiveness of dense retrieval versus sparse retrieval for RAG systems?", "source_paper": "Sparse Meets Dense: A Hybrid Approach to Enhance Scientific Document Retrieval", "source_year": 2024, "source_id": "sparse_dense_hybrid_id" }, { "id": "A036", "category": "A", "subfield": "rag", "topic": "semantic_chunking", "question": "What is the established understanding of whether semantic chunking outperforms fixed-size chunking in RAG systems?", "source_paper": "Is Semantic Chunking Worth the Computational Cost?", "source_year": 2024, "source_id": "cbf04ffc43b73d315b7ba1c45866bc4eab68ebfc" }, { "id": "A037", "category": "A", "subfield": "rag", "topic": "hybrid_retrieval", "question": "What is the current consensus on combining dense and sparse retrieval methods in RAG pipelines?", "source_paper": "DAT: Dynamic Alpha Tuning for Hybrid Retrieval in Retrieval-Augmented Generation", "source_year": 2025, "source_id": "4e57d2fa070c8c9e2f36341c5f86e35b6ca33f2c" }, { "id": "A038", "category": "A", "subfield": "rag", "topic": "rag_long_context", "question": "What is the established consensus on when RAG outperforms long-context LLMs for knowledge retrieval?", "source_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs", "source_year": 2025, "source_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521" }, { "id": "A039", "category": "A", "subfield": "rag", "topic": "graph_rag", "question": "What does the literature agree on regarding graph RAG for customized domain-specific LLMs?", "source_paper": "A Survey of Graph Retrieval-Augmented Generation for Customized Large 
Language Models", "source_year": 2025, "source_id": "908d45b0d2b88ba72ee501c368eb618d29d61ce0" }, { "id": "A040", "category": "A", "subfield": "rag", "topic": "hybrid_retrieval", "question": "What is the consensus on prompt-based methods for generating both dense and sparse representations for retrieval?", "source_paper": "PromptReps: Prompting Large Language Models to Generate Dense and Sparse Representations for Zero-Shot Document Retrieval", "source_year": 2024, "source_id": "ee8918225cc3c558b07cada34ac366a9dc081bdd" }, { "id": "A041", "category": "A", "subfield": "rag", "topic": "rag_overview", "question": "What does the survey literature agree on regarding RAG for natural language processing tasks?", "source_paper": "Retrieval-Augmented Generation for Natural Language Processing: A Survey", "source_year": 2024, "source_id": "rag_nlp_survey_id" }, { "id": "A042", "category": "A", "subfield": "rag", "topic": "rag_overview", "question": "What is the current consensus on RAG and its extensions for AI-generated content quality?", "source_paper": "Retrieval Augmented Generation (RAG) and Beyond: A Comprehensive Survey", "source_year": 2024, "source_id": "rag_beyond_survey_id" }, { "id": "A043", "category": "A", "subfield": "rag", "topic": "chunking", "question": "What does the literature agree on regarding the impact of chunking strategies on domain-specific RAG performance?", "source_paper": "The Impact of Chunking Strategies on Domain-Specific Information Retrieval", "source_year": 2025, "source_id": "chunking_domain_id" }, { "id": "A044", "category": "A", "subfield": "rag", "topic": "rag_overview", "question": "What is the established consensus on RAG for automating systematic literature reviews?", "source_paper": "Automating Systematic Literature Reviews with Retrieval-Augmented Generation", "source_year": 2024, "source_id": "rag_lit_review_id" }, { "id": "A045", "category": "A", "subfield": "rag", "topic": "hybrid_retrieval", "question": "What is the 
established view on whether small embedding models with LLM re-ranking can beat large embedding models in hybrid retrieval?", "source_paper": "Rethinking Hybrid Retrieval: When Small Embeddings and LLM Re-ranking Beat Bigger Models", "source_year": 2025, "source_id": "11bb1e4137b1c1daf11464bd7b4750cb0b5db8d8" }, { "id": "A046", "category": "A", "subfield": "training_methods", "topic": "dpo", "question": "What is the consensus on token-level reward guidance as an improvement to standard DPO?", "source_paper": "TGDPO: Harnessing Token-Level Reward Guidance for Enhancing Direct Preference Optimization", "source_year": 2025, "source_id": "tgdpo_id" }, { "id": "A047", "category": "A", "subfield": "training_methods", "topic": "dpo", "question": "What does the literature agree on about applying DPO to code-generation LLMs?", "source_paper": "Aligning CodeLLMs with Direct Preference Optimization", "source_year": 2024, "source_id": "code_dpo_id" }, { "id": "A048", "category": "A", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "What is the established view on lossless speculative decoding for diffusion language models?", "source_paper": "Spiffy: Multiplying Diffusion LLM Acceleration via Lossless Speculative Decoding", "source_year": 2025, "source_id": "2a9c37efd3b943e58f0cf56ee91c9ff7894546cb" }, { "id": "A049", "category": "A", "subfield": "rag", "topic": "rag_overview", "question": "What is the current consensus on the limitations of RAG for non-factoid question answering?", "source_paper": "Typed-RAG: Type-Aware Decomposition of Non-Factoid Questions for Retrieval-Augmented Generation", "source_year": 2025, "source_id": "eb4df9446d932b422c68633836611e63be06d0e1" }, { "id": "A050", "category": "A", "subfield": "rag", "topic": "graph_rag", "question": "What does the literature agree on regarding graph-based RAG for large language model customization in specialized domains?", "source_paper": "Graph Retrieval-Augmented Generation for Large 
Language Models: A Survey", "source_year": 2024, "source_id": "graph_rag2_survey_id" }, { "id": "B001", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "kv_cache_eviction", "question": "What were the dominant KV cache eviction strategies before adaptive merging methods, and what superseded them?", "older_paper": "H2O: Heavy-Hitter Oracle for Efficient Generative Inference of Large Language Models", "older_year": 2023, "newer_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks", "newer_year": 2024, "newer_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2" }, { "id": "B002", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did depth-dimension KV cache compression challenge the established layer-independent compression approach?", "older_paper": "Layer-independent KV cache compression (StreamingLLM-style eviction)", "older_year": 2023, "newer_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models", "newer_year": 2024, "newer_id": "d372fb69c485472385f152bc832bf1d35e223324" }, { "id": "B003", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did self-speculative decoding methods supersede the requirement for a separate draft model?", "older_paper": "Speculative Decoding (original draft-model approach)", "older_year": 2022, "newer_paper": "SWIFT: On-the-Fly Self-Speculative Decoding for LLM Inference Acceleration", "newer_year": 2024, "newer_id": "bacdf9671fb872287201b53d768df89b4d6630a3" }, { "id": "B004", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did CTC-based draft models demonstrate limitations in standard autoregressive draft models for speculative decoding?", "older_paper": "Standard autoregressive draft model speculative 
decoding", "older_year": 2022, "newer_paper": "Speculative Decoding with CTC-based Draft Model for LLM Inference Acceleration", "newer_year": 2024, "newer_id": "3230ed476488a459d27efc22e8cc5eb4d0298c4f" }, { "id": "B005", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did multi-draft speculative decoding improve over the original single-draft paradigm?", "older_paper": "Original speculative decoding with single draft", "older_year": 2022, "newer_paper": "Towards Optimal Multi-draft Speculative Decoding", "newer_year": 2025, "newer_id": "multidraft_spec_id" }, { "id": "B006", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "rlhf_vs_dpo", "question": "How did DPO show that PPO-based RLHF is not necessary for preference alignment in LLMs?", "older_paper": "Proximal Policy Optimization (PPO) for RLHF alignment", "older_year": 2022, "newer_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "newer_year": 2023, "newer_id": "dpo_original_id" }, { "id": "B007", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "qlora_vs_lora", "question": "How did QLoRA demonstrate that 4-bit quantized fine-tuning could match full 16-bit LoRA performance?", "older_paper": "LoRA: Low-Rank Adaptation of Large Language Models", "older_year": 2021, "newer_paper": "QLoRA: Efficient Finetuning of Quantized LLMs", "newer_year": 2023, "newer_id": "qlora_original_id" }, { "id": "B008", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "loftq_vs_qlora", "question": "How did LoftQ identify and address the initialization gap that limits QLoRA fine-tuning quality?", "older_paper": "QLoRA: Efficient Finetuning of Quantized LLMs", "older_year": 2023, "newer_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models", "newer_year": 2023, "newer_id": "loftq_id" 
}, { "id": "B009", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "dpo_variants", "question": "How did hybrid rejection sampling DPO address the distribution shift problem in standard DPO?", "older_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "older_year": 2023, "newer_paper": "RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization", "newer_year": 2024, "newer_id": "rsdpo_id" }, { "id": "B010", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "rlhf_reward_hacking", "question": "How did adversarial preference learning expose vulnerabilities in standard RLHF reward models?", "older_paper": "Standard RLHF with fixed reward model", "older_year": 2022, "newer_paper": "Adversarial Preference Learning for Robust LLM Alignment", "newer_year": 2025, "newer_id": "adversarial_pref_id" }, { "id": "B011", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "chunking_strategies", "question": "How did semantic chunking challenge the assumption that fixed-size chunking is sufficient for RAG retrieval quality?", "older_paper": "Fixed-size chunking for RAG (early RAG implementations)", "older_year": 2021, "newer_paper": "Is Semantic Chunking Worth the Computational Cost?", "newer_year": 2024, "newer_id": "cbf04ffc43b73d315b7ba1c45866bc4eab68ebfc" }, { "id": "B012", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "hybrid_retrieval", "question": "How did hybrid dense-sparse retrieval demonstrate limitations in relying solely on dense vector retrieval?", "older_paper": "Dense Passage Retrieval (DPR)", "older_year": 2020, "newer_paper": "Sparse Meets Dense: A Hybrid Approach to Enhance Scientific Document Retrieval", "newer_year": 2024, "newer_id": "sparse_dense_hybrid_id" }, { "id": "B013", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "rag_vs_long_context", "question": "How 
did long-context LLMs challenge the assumption that RAG is always the best approach for external knowledge integration?", "older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "older_year": 2020, "newer_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs", "newer_year": 2025, "newer_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521" }, { "id": "B014", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "graph_rag", "question": "How did graph-based RAG supersede flat vector retrieval for multi-hop reasoning tasks?", "older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "older_year": 2020, "newer_paper": "Graph Retrieval-Augmented Generation: A Survey", "newer_year": 2024, "newer_id": "graph_rag_survey_id" }, { "id": "B015", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "dense_retrieval", "question": "How did prompt-based unified dense-sparse representations challenge separate dense and sparse retrieval systems?", "older_paper": "Separate dense (DPR) and sparse (BM25) retrieval systems", "older_year": 2020, "newer_paper": "PromptReps: Prompting Large Language Models to Generate Dense and Sparse Representations for Zero-Shot Document Retrieval", "newer_year": 2024, "newer_id": "ee8918225cc3c558b07cada34ac366a9dc081bdd" }, { "id": "B016", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "hybrid_retrieval", "question": "How did dynamic alpha tuning improve over static weight assignment in hybrid dense-sparse retrieval for RAG?", "older_paper": "Static weight hybrid retrieval (fixed BM25 + dense weights)", "older_year": 2022, "newer_paper": "DAT: Dynamic Alpha Tuning for Hybrid Retrieval in Retrieval-Augmented Generation", "newer_year": 2025, "newer_id": "4e57d2fa070c8c9e2f36341c5f86e35b6ca33f2c" }, { "id": "B017", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": 
"kv_cache_compression", "question": "How did token importance diversity criteria supersede pure importance-only KV cache eviction policies?", "older_paper": "H2O: Heavy-Hitter Oracle for KV Cache Eviction", "older_year": 2023, "newer_paper": "Mixing Importance with Diversity: Joint Optimization for KV Cache Compression in Large Vision-Language Models", "newer_year": 2025, "newer_id": "d31ca000a53fda36cdba549691c08052783173d8" }, { "id": "B018", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "rlhf_alignment", "question": "How did direct alignment methods theoretically subsume PPO-based RLHF as a special case?", "older_paper": "PPO-based RLHF for language model alignment", "older_year": 2022, "newer_paper": "From RLHF to Direct Alignment: A Theoretical Unification of Preference Learning", "newer_year": 2026, "newer_id": "rlhf_unification_id" }, { "id": "B019", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "rag_evaluation", "question": "How did structured RAG evaluation frameworks expose the inadequacy of single-metric retrieval benchmarks?", "older_paper": "Early RAG evaluation using only BLEU/ROUGE", "older_year": 2021, "newer_paper": "Evaluation of Retrieval-Augmented Generation: A Survey", "newer_year": 2024, "newer_id": "rag_eval_survey_id" }, { "id": "B020", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did autoencoder-based KV cache compression supersede simple quantization-only KV compression approaches?", "older_paper": "KV cache quantization (INT8/INT4 KV cache)", "older_year": 2023, "newer_paper": "KV-CAR: KV Cache Compression using Autoencoders and KV Reuse in Large Language Models", "newer_year": 2025, "newer_id": "78a7d0994ab6fb4ccafdfb87d9e8fc2f171558ef" }, { "id": "B021", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "dpo", "question": "How did analysis of DPO 
gradient imbalance reveal a previously unrecognized training instability in standard DPO?", "older_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", "older_year": 2023, "newer_paper": "Gradient Imbalance in Direct Preference Optimization", "newer_year": 2025, "newer_id": "gradient_imbalance_dpo_id" }, { "id": "B022", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did batch speculative decoding with optimal token selection supersede greedy draft token selection?", "older_paper": "Greedy draft token selection in speculative decoding", "older_year": 2023, "newer_paper": "TETRIS: Optimal Draft Token Selection for Batch Speculative Decoding", "newer_year": 2025, "newer_id": "tetris_spec_id" }, { "id": "B023", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "chunking_strategies", "question": "How did multi-overlap chunking demonstrate that non-overlapping semantic chunks miss critical context boundaries?", "older_paper": "Single-boundary semantic chunking for RAG", "older_year": 2023, "newer_paper": "Mix-Of-Overlap: Enhancing Retrieval-Augmented Generation with Multiple Overlapping Chunks", "newer_year": 2025, "newer_id": "mix_overlap_id" }, { "id": "B024", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "lora", "question": "How did layer-wise adaptive LoRA improve over uniform rank assignment across all transformer layers?", "older_paper": "LoRA: Low-Rank Adaptation of Large Language Models", "older_year": 2021, "newer_paper": "La-LoRA: Parameter-efficient fine-tuning with layer-wise adaptive low-rank adaptation", "newer_year": 2025, "newer_id": "lalora_id" }, { "id": "B025", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "retrieval_decoupling", "question": "How did cross-attention decoupling of knowledge and context expose inefficiencies in standard RAG 
architectures?", "older_paper": "Standard concatenation-based RAG architectures", "older_year": 2022, "newer_paper": "Decoupling Knowledge and Context: An Efficient and Effective Retrieval Augmented Generation Framework via Cross Attention", "newer_year": 2025, "newer_id": "d9305383cf1cbea1239f0301c06b314fee43cf4d" }, { "id": "B026", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did adaptive KV cache merging improve over fixed-budget token eviction for long-context tasks?", "older_paper": "Fixed-budget KV eviction methods (H2O, SnapKV)", "older_year": 2023, "newer_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks", "newer_year": 2024, "newer_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2" }, { "id": "B027", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "lora", "question": "How did QR decomposition-based LoRA initialization expose the sensitivity of standard random initialization in LoRA?", "older_paper": "LoRA: Low-Rank Adaptation of Large Language Models", "older_year": 2021, "newer_paper": "QR-LoRA: QR-Based Low-Rank Adaptation for Efficient Fine-Tuning", "newer_year": 2025, "newer_id": "qrlora_id" }, { "id": "B028", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "graph_rag", "question": "How did graph-structured retrieval address the flat retrieval assumption that all documents are equally connected?", "older_paper": "DPR and flat vector retrieval for RAG", "older_year": 2020, "newer_paper": "Graph Retrieval-Augmented Generation for Large Language Models: A Survey", "newer_year": 2024, "newer_id": "graph_rag2_survey_id" }, { "id": "B029", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did draft model alignment with chain-of-thought distillation supersede vanilla draft model training?", 
"older_paper": "Standard small draft model training for speculative decoding", "older_year": 2023, "newer_paper": "Direct Alignment of Draft Model for Speculative Decoding with Chain-of-Thought Distillation", "newer_year": 2024, "newer_id": "direct_align_draft_id" }, { "id": "B030", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "rlhf", "question": "How did discriminative reward modeling address the attention hacking failure mode in standard RLHF reward models?", "older_paper": "Standard RLHF reward model training", "older_year": 2022, "newer_paper": "Alleviating Attention Hacking in Discriminative Reward Modeling", "newer_year": 2025, "newer_id": "attention_hacking_id" }, { "id": "B031", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "rag_evaluation", "question": "How did RAG and long-context benchmarking reveal that neither approach dominates across all task types?", "older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", "older_year": 2020, "newer_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs - No Silver Bullet", "newer_year": 2025, "newer_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521" }, { "id": "B032", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did theoretical analysis of compression barriers expose practical limits of KV cache reduction in autoregressive models?", "older_paper": "Empirical KV cache compression methods (H2O, MiniCache, SnapKV)", "older_year": 2023, "newer_paper": "Compression Barriers for Autoregressive Transformers", "newer_year": 2025, "newer_id": "compression_barriers_id" }, { "id": "B033", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "lora_vs_full_ft", "question": "How did empirical comparison show LoRA underperforms full fine-tuning for handwritten text recognition despite memory 
savings?", "older_paper": "LoRA as universal fine-tuning replacement", "older_year": 2021, "newer_paper": "Low-Rank Adaptation vs. Fine-Tuning for Handwritten Text Recognition", "newer_year": 2025, "newer_id": "lora_vs_ft_htr_id" }, { "id": "B034", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "hybrid_retrieval", "question": "How did scalable sparse retrieval in decoder-only LLMs challenge the assumption that sparse retrieval requires encoder models?", "older_paper": "BM25 and sparse retrieval with encoder-based models", "older_year": 2021, "newer_paper": "Scaling Sparse and Dense Retrieval in Decoder-Only LLMs", "newer_year": 2025, "newer_id": "scaling_sparse_dense_id" }, { "id": "B035", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did lag-relative token importance scoring supersede attention-score-only KV eviction criteria?", "older_paper": "Attention-score based KV eviction (H2O, SnapKV)", "older_year": 2023, "newer_paper": "LagKV: Lag-Relative Information of the KV Cache Tells Which Tokens Are Important", "newer_year": 2025, "newer_id": "50a2c39150d4faca53f4990ddf3d479ff570be23" }, { "id": "B036", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "dpo", "question": "How did DPO for diffusion models extend and challenge the assumption that preference optimization is only applicable to autoregressive LLMs?", "older_paper": "RLHF and DPO for autoregressive language models only", "older_year": 2023, "newer_paper": "Diffusion Model Alignment Using Direct Preference Optimization", "newer_year": 2023, "newer_id": "diffusion_dpo_id" }, { "id": "B037", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "chunking_strategies", "question": "How did context reconstruction strategies for RAG expose the limitations of forward-only chunking?", "older_paper": "Standard forward-only chunking for RAG", 
"older_year": 2022, "newer_paper": "Reconstructing Context: Evaluating Advanced Chunking Strategies for RAG", "newer_year": 2025, "newer_id": "reconstructing_context_id" }, { "id": "B038", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did KV cache streaming expose the inadequacy of in-memory-only KV cache management for long-context serving?", "older_paper": "In-memory KV cache management for LLM serving", "older_year": 2022, "newer_paper": "CacheGen: KV Cache Compression and Streaming for Fast Large Language Model Serving", "newer_year": 2023, "newer_id": "40e565e070fde823097507fd6830cfa6944df95d" }, { "id": "B039", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "lora", "question": "How did AutoML-based parameter-efficient fine-tuning selection challenge manually chosen LoRA configurations?", "older_paper": "Manually configured LoRA hyperparameters", "older_year": 2021, "newer_paper": "AutoAdapt: On the Application of AutoML for Parameter-Efficient Fine-Tuning", "newer_year": 2025, "newer_id": "autoadapt_id" }, { "id": "B040", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "rag_overview", "question": "How did graph RAG for customized LLMs demonstrate that flat document retrieval is insufficient for domain-specific expert knowledge?", "older_paper": "Standard dense vector RAG for domain adaptation", "older_year": 2021, "newer_paper": "A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models", "newer_year": 2025, "newer_id": "908d45b0d2b88ba72ee501c368eb618d29d61ce0" }, { "id": "B041", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did INT8 GPU-accelerated KV cache quantization supersede FP16 KV cache storage as the deployment baseline?", "older_paper": "FP16 KV cache storage baseline for LLM inference", 
"older_year": 2022, "newer_paper": "GPU-Accelerated INT8 Quantization for KV Cache Compression in Large Language Models", "newer_year": 2026, "newer_id": "gpu_int8_kv_id" }, { "id": "B042", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "hybrid_retrieval", "question": "How did domain-specific hybrid search demonstrate BM25-alone insufficiency for specialized question answering?", "older_paper": "BM25-only retrieval for domain QA", "older_year": 2020, "newer_paper": "Domain-specific Question Answering with Hybrid Search", "newer_year": 2024, "newer_id": "da0b29c1f5d6d7fdb575023d62416e3751314a98" }, { "id": "B043", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "rlhf", "question": "How did length-bias analysis reveal that high RLHF reward scores may reflect verbosity rather than quality?", "older_paper": "Standard RLHF reward model without length normalization", "older_year": 2022, "newer_paper": "Bias Fitting to Mitigate Length Bias of Reward Model in RLHF", "newer_year": 2025, "newer_id": "length_bias_rlhf_id" }, { "id": "B044", "category": "B", "supersession_type": "soft", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "How did speculative decoding with self-verification supersede always-accept draft token strategies?", "older_paper": "Always-accept draft token speculative decoding approaches", "older_year": 2023, "newer_paper": "Draft Model Knows When to Stop: Self-Verification Speculative Decoding", "newer_year": 2024, "newer_id": "selfverify_spec_id" }, { "id": "B045", "category": "B", "supersession_type": "hard", "subfield": "rag", "topic": "rag_overview", "question": "How did decoupled cross-attention RAG demonstrate that standard prepend-and-attend context integration wastes computation?", "older_paper": "Standard prepend-context RAG architecture", "older_year": 2020, "newer_paper": "Decoupling Knowledge and Context: An Efficient and Effective Retrieval Augmented 
Generation Framework via Cross Attention", "newer_year": 2025, "newer_id": "d9305383cf1cbea1239f0301c06b314fee43cf4d" }, { "id": "B046", "category": "B", "supersession_type": "soft", "subfield": "training_methods", "topic": "lora", "question": "How did SPM-LoRA's task-adaptive fine-tuning framework challenge the assumption that a single LoRA configuration generalizes across tasks?", "older_paper": "Single-configuration LoRA for all tasks", "older_year": 2021, "newer_paper": "SPM-LoRA: A Novel Framework for Task-Adaptive Fine-Tuning Using Sparse Plus Mixture LoRA", "newer_year": 2025, "newer_id": "spm_lora_id" }, { "id": "B047", "category": "B", "supersession_type": "hard", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "How did dynamic retrieval KV cache approaches challenge static KV cache management policies?", "older_paper": "Static KV cache management policies (fixed eviction budget)", "older_year": 2023, "newer_paper": "HeteroCache: A Dynamic Retrieval Approach to Heterogeneous KV Cache Management", "newer_year": 2026, "newer_id": "heterocache_id" }, { "id": "B048", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "chunking_strategies", "question": "How did the analysis of chunking techniques across diverse document types challenge single-strategy chunking assumptions?", "older_paper": "Single-strategy chunking (fixed-size or semantic-only)", "older_year": 2022, "newer_paper": "Comparison of Chunking Techniques Across Diverse Document Types in RAG", "newer_year": 2025, "newer_id": "chunking_comparison_id" }, { "id": "B049", "category": "B", "supersession_type": "hard", "subfield": "training_methods", "topic": "qlora", "question": "How did gradient-aware LoftQ improvement expose the suboptimality of standard alternating least squares quantization in fine-tuning?", "older_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models", "older_year": 2023, "newer_paper": "GA-LoftQ: Gradient-Aware Alternating Least 
Squares Framework for Quantized LoRA Fine-Tuning", "newer_year": 2025, "newer_id": "ga_loftq_id" }, { "id": "B050", "category": "B", "supersession_type": "soft", "subfield": "rag", "topic": "graph_rag", "question": "How did heterogeneous document RAG expose the limitations of text-only retrieval pipelines for tables and figures?", "older_paper": "Text-only retrieval for RAG systems", "older_year": 2020, "newer_paper": "TableRAG: A Retrieval Augmented Generation Framework for Heterogeneous Document Reasoning", "newer_year": 2025, "newer_id": "8b0606d354d1452c9893b08f991a2da0f8ea4580" }, { "id": "C001", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Is there consensus on whether KV cache compression meaningfully degrades reasoning ability in LLMs?", "camps": "Camp A: KV cache compression preserves reasoning performance within acceptable bounds for most benchmarks (MiniCache, SnapKV papers). Camp B: KV cache compression causes disproportionate reasoning degradation that standard benchmarks fail to capture (Hold Onto That Thought, 2025).", "label": "CONTESTED" }, { "id": "C002", "category": "C", "subfield": "llm_efficiency", "topic": "quantization", "question": "Do experts agree on whether quantized LLMs are suitable for agentic task execution?", "camps": "Camp A: Quantization preserves sufficient capability for agentic tasks at 4-bit precision. Camp B: Compressed LLMs exhibit significant capability degradation on multi-step agentic workflows that single-task benchmarks do not reveal (Can Compressed LLMs Truly Act?, 2025).", "label": "CONTESTED" }, { "id": "C003", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Is there agreement on the theoretical limits of KV cache compression for autoregressive transformers?", "camps": "Camp A: KV cache compression ratios are primarily empirically bounded by task accuracy constraints, with no hard theoretical floor. 
Camp B: There exist information-theoretic compression barriers for autoregressive transformers that bound achievable compression independently of method choice (Compression Barriers for Autoregressive Transformers, 2025).", "label": "CONTESTED" }, { "id": "C004", "category": "C", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "Do researchers agree on whether speculative decoding is lossless in practice for all LLM architectures?", "camps": "Camp A: Speculative decoding with proper acceptance criteria guarantees lossless output distribution matching the target model. Camp B: Practical implementation of lossless speculative decoding for diffusion LLMs and non-standard architectures requires additional mechanisms not present in original speculative decoding (Spiffy, 2025).", "label": "CONTESTED" }, { "id": "C005", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Is there consensus on whether attention-score-based token importance is the right criterion for KV cache eviction?", "camps": "Camp A: Attention scores are a reliable and sufficient proxy for token importance in KV eviction decisions. Camp B: Lag-relative and recency-aware token importance signals outperform pure attention scores for long-context KV eviction (LagKV, 2025).", "label": "CONTESTED" }, { "id": "C006", "category": "C", "subfield": "training_methods", "topic": "rlhf_vs_dpo", "question": "Do researchers agree on whether DPO is superior to PPO for LLM alignment across all task types?", "camps": "Camp A: DPO provides more stable and simpler training than PPO and achieves comparable or better alignment quality without a separate reward model. 
Camp B: PPO retains advantages over DPO on complex reasoning tasks and instruction following where on-policy exploration matters (multiple RLHF survey papers, 2024-2025).", "label": "CONTESTED" }, { "id": "C007", "category": "C", "subfield": "training_methods", "topic": "reward_hacking", "question": "Is there agreement on whether current RLHF reward models are robust to adversarial preference manipulation?", "camps": "Camp A: RLHF reward models with sufficient scale and diverse human feedback are robust to most adversarial inputs. Camp B: Standard RLHF reward models are systematically vulnerable to adversarial preference attacks that exploit reward model blind spots (Adversarial Preference Learning, 2025).", "label": "CONTESTED" }, { "id": "C008", "category": "C", "subfield": "training_methods", "topic": "length_bias", "question": "Do researchers agree on the severity of length bias as a distortion in RLHF reward model training?", "camps": "Camp A: Length bias is a known but manageable artifact in RLHF reward models that can be controlled through dataset curation. Camp B: Length bias is a fundamental and systematic distortion that causes reward models to conflate verbosity with quality, requiring explicit architectural mitigation (Bias Fitting, 2025).", "label": "CONTESTED" }, { "id": "C009", "category": "C", "subfield": "training_methods", "topic": "lora_vs_full_ft", "question": "Is there consensus on whether LoRA consistently matches full fine-tuning performance across diverse tasks?", "camps": "Camp A: LoRA achieves near-full-fine-tuning quality across most NLP tasks with dramatically fewer trainable parameters, making it the default choice. Camp B: LoRA underperforms full fine-tuning on specialized tasks such as handwritten text recognition and structured prediction where task-specific weight updates matter (Low-Rank Adaptation vs. 
Fine-Tuning for HTR, 2025).", "label": "CONTESTED" }, { "id": "C010", "category": "C", "subfield": "training_methods", "topic": "dpo_stability", "question": "Do researchers agree on whether DPO training is stable and well-calibrated across different model scales?", "camps": "Camp A: DPO is inherently more stable than PPO because it eliminates the separate reward model and online sampling, reducing variance. Camp B: DPO exhibits gradient imbalance and distribution shift problems that destabilize training at scale, requiring hybrid methods like RS-DPO (Gradient Imbalance in DPO, 2025; RS-DPO, 2024).", "label": "CONTESTED" }, { "id": "C011", "category": "C", "subfield": "training_methods", "topic": "qlora_quality", "question": "Is there agreement on whether QLoRA's 4-bit quantization introduces acceptable quality degradation for production use?", "camps": "Camp A: QLoRA achieves full fine-tuning quality within noise thresholds on standard benchmarks, making it production-viable. Camp B: QLoRA's quantization-induced initialization gap causes measurable downstream quality degradation that requires additional correction (LoftQ, 2023; GA-LoftQ, 2025).", "label": "CONTESTED" }, { "id": "C012", "category": "C", "subfield": "rag", "topic": "rag_vs_long_context", "question": "Do researchers agree on whether RAG or long-context LLMs are the better approach for external knowledge integration?", "camps": "Camp A: RAG is more cost-efficient and controllable for knowledge-intensive tasks, and outperforms long-context LLMs on multi-document retrieval. 
Camp B: Long-context LLMs with sufficient context windows can match or surpass RAG on many tasks, and there is no silver bullet — task routing is required (LaRA, 2025).", "label": "CONTESTED" }, { "id": "C013", "category": "C", "subfield": "rag", "topic": "semantic_chunking", "question": "Is there consensus on whether semantic chunking provides meaningful retrieval quality improvement over fixed-size chunking?", "camps": "Camp A: Semantic chunking consistently improves retrieval precision by preserving coherent units of meaning. Camp B: Semantic chunking's improvements are marginal and task-dependent, often not justifying the added computational overhead (Is Semantic Chunking Worth the Computational Cost?, 2024).", "label": "CONTESTED" }, { "id": "C014", "category": "C", "subfield": "rag", "topic": "hybrid_retrieval", "question": "Do researchers agree on the optimal balance between dense and sparse signals in hybrid retrieval for RAG?", "camps": "Camp A: A fixed interpolation weight between BM25 and dense retrieval scores generalizes well across domains with minimal tuning. Camp B: Dynamic per-query alpha tuning is necessary for robust hybrid retrieval performance across heterogeneous document types (DAT, 2025).", "label": "CONTESTED" }, { "id": "C015", "category": "C", "subfield": "rag", "topic": "graph_rag", "question": "Is there agreement on whether graph-structured retrieval is necessary for multi-hop reasoning in RAG systems?", "camps": "Camp A: Sufficiently large dense retrieval models with re-ranking can handle multi-hop reasoning without explicit graph structure. 
Camp B: Graph-based RAG is necessary for multi-hop reasoning over relational knowledge because flat retrieval cannot capture cross-document entity relationships (Graph RAG Survey, 2024).", "label": "CONTESTED" }, { "id": "C016", "category": "C", "subfield": "rag", "topic": "retrieval_decoupling", "question": "Do researchers agree on whether cross-attention decoupling of context and knowledge improves RAG efficiency and quality?", "camps": "Camp A: Standard concatenation of retrieved documents with the query is sufficient and simpler to implement. Camp B: Cross-attention decoupling of knowledge and context reduces redundant computation and improves factual grounding (Decoupling Knowledge and Context, 2025).", "label": "CONTESTED" }, { "id": "C017", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Is there consensus on whether KV cache merging is preferable to KV cache eviction for long-context LLM tasks?", "camps": "Camp A: Token eviction methods are simpler and sufficient, as dropped tokens contribute marginally to final outputs. Camp B: KV cache merging preserves information that eviction permanently destroys, leading to measurably better long-context task accuracy (KVMerger: Model Tells You Where to Merge, 2024).", "label": "CONTESTED" }, { "id": "C018", "category": "C", "subfield": "training_methods", "topic": "rlhf", "question": "Do researchers agree on whether RLHF reward models generalize across cultural and linguistic contexts?", "camps": "Camp A: RLHF models trained on sufficiently diverse human feedback generalize reasonably across cultural contexts. 
Camp B: RLHF reward models trained predominantly on English and Western feedback exhibit systematic cultural bias that limits cross-cultural alignment (RLHF Cultural Survey, 2025).", "label": "CONTESTED" }, { "id": "C019", "category": "C", "subfield": "rag", "topic": "rag_evaluation", "question": "Is there agreement on what metrics constitute a complete evaluation of RAG system quality?", "camps": "Camp A: Faithfulness and answer relevance (as measured by Ragas) are sufficient for a complete RAG quality evaluation. Camp B: Complete RAG evaluation requires additional dimensions including retrieval precision, context utilization, and temporal relevance that Ragas does not capture (Evaluation of RAG Survey, 2024).", "label": "CONTESTED" }, { "id": "C020", "category": "C", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "Do researchers agree on whether speculative decoding provides consistent speedup across all LLM serving scenarios?", "camps": "Camp A: Speculative decoding provides reliable 2-3x speedup across batch sizes and input types when draft and target models are well-matched. Camp B: Speculative decoding speedup degrades significantly under high-batch serving scenarios and with diverse prompts, where draft model acceptance rates drop substantially (survey papers on speculative decoding, 2024).", "label": "CONTESTED" }, { "id": "C021", "category": "C", "subfield": "training_methods", "topic": "lora", "question": "Is there consensus on the optimal rank configuration for LoRA across different model sizes and task types?", "camps": "Camp A: Low ranks (r=4 to r=16) generalize well across most tasks and model sizes, making manual rank selection straightforward. 
Camp B: Optimal LoRA rank is highly task- and layer-dependent, requiring adaptive rank assignment methods for peak performance (La-LoRA, AutoAdapt, 2025).", "label": "CONTESTED" }, { "id": "C022", "category": "C", "subfield": "rag", "topic": "chunking_strategies", "question": "Do researchers agree on whether there is a universally optimal chunking strategy for RAG across document types?", "camps": "Camp A: Semantic chunking with moderate overlap is the best general-purpose strategy for most document types in RAG. Camp B: No single chunking strategy dominates across diverse document types; optimal chunking is document-structure-dependent and requires task-specific configuration (Comparison of Chunking Techniques, 2025).", "label": "CONTESTED" }, { "id": "C023", "category": "C", "subfield": "training_methods", "topic": "rlhf_vs_dpo", "question": "Is there agreement on whether preference alignment methods like DPO can fully replace RLHF for safety-critical alignment?", "camps": "Camp A: DPO and its variants provide sufficient alignment guarantees for safety-critical applications while being more tractable than RLHF. Camp B: Online RL methods like PPO-based RLHF remain necessary for robust safety alignment because DPO's offline nature cannot adapt to distribution shift during deployment (alignment theory papers, 2025-2026).", "label": "CONTESTED" }, { "id": "C024", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Do researchers agree on whether KV cache compression techniques transfer effectively from standard LLMs to vision-language models?", "camps": "Camp A: KV cache compression methods developed for text-only LLMs transfer directly to vision-language models with minimal adaptation. 
Camp B: Vision-language models require joint token importance and diversity criteria for KV compression because visual tokens have fundamentally different importance distributions than text tokens (ZipVL, Mixing Importance with Diversity, 2024-2025).", "label": "CONTESTED" }, { "id": "C025", "category": "C", "subfield": "rag", "topic": "hybrid_retrieval", "question": "Is there consensus on whether hybrid retrieval consistently outperforms pure dense retrieval in production RAG systems?", "camps": "Camp A: Hybrid retrieval (BM25 + dense) consistently outperforms pure dense retrieval across domains due to complementary strengths of lexical and semantic matching. Camp B: The advantage of hybrid retrieval over pure dense retrieval is highly dataset-dependent, and for in-domain tasks with strong embeddings, pure dense often matches hybrid (Rethinking Hybrid Retrieval, 2025; PIRB, 2024).", "label": "CONTESTED" }, { "id": "C026", "category": "C", "subfield": "training_methods", "topic": "qlora", "question": "Do researchers agree on whether QLoRA's memory savings justify its use over standard LoRA for all model sizes?", "camps": "Camp A: QLoRA's 4-bit quantization enables fine-tuning of models that would otherwise not fit in GPU memory, with negligible quality cost. Camp B: For models that fit in memory with standard LoRA, QLoRA introduces quantization noise that outweighs its memory benefits, and LoftQ-style corrections are needed (QLoRA, 2023; LoftQ, 2023; GA-LoftQ, 2025).", "label": "CONTESTED" }, { "id": "C027", "category": "C", "subfield": "llm_efficiency", "topic": "speculative_decoding", "question": "Is there agreement on whether draft model alignment improves speculative decoding acceptance rates significantly?", "camps": "Camp A: Standard small language model draft models achieve sufficient token acceptance rates without specialized alignment training. 
Camp B: Explicit draft model alignment with chain-of-thought distillation or direct alignment methods significantly improves acceptance rates and end-to-end throughput (Direct Alignment of Draft Model, 2024; AdaEAGLE, 2024).", "label": "CONTESTED" }, { "id": "C028", "category": "C", "subfield": "rag", "topic": "graph_rag", "question": "Do researchers agree on the computational overhead justification for graph-based RAG versus flat retrieval RAG?", "camps": "Camp A: Graph RAG's higher computational cost is justified by substantially better multi-hop reasoning quality in complex domain tasks. Camp B: The computational overhead of graph construction and traversal in graph RAG often exceeds its quality benefits for standard question-answering workloads (Graph RAG surveys, 2024).", "label": "CONTESTED" }, { "id": "C029", "category": "C", "subfield": "training_methods", "topic": "rlhf", "question": "Is there consensus on whether RLHF with human feedback scales effectively to multimodal alignment tasks?", "camps": "Camp A: RLHF principles transfer directly to multimodal alignment with appropriate reward signal design for image-text tasks. Camp B: Multimodal RLHF requires substantially different reward modeling and feedback collection than text-only RLHF, and current methods have not solved the cross-modal reward specification problem (Preference Alignment on Diffusion Models Survey, 2025).", "label": "CONTESTED" }, { "id": "C030", "category": "C", "subfield": "llm_efficiency", "topic": "kv_cache_compression", "question": "Do researchers agree on whether KV cache rethinking approaches improve over targeted eviction for long generation tasks?", "camps": "Camp A: Targeted eviction with careful importance scoring is sufficient and simpler than wholesale KV cache architecture rethinking. 
Camp B: Fundamental rethinking of KV cache compression is required because targeted eviction methods hit an inherent ceiling on long-generation tasks (Rethinking Key-Value Cache Compression Techniques, 2025).", "label": "CONTESTED" } ]