recon / eval /questions.json
MukulRay's picture
Phase 13: HF Spaces deploy ready - verdict logging, clean requirements
6f237d6
[
{
"id": "A001",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What is the current consensus on the primary bottleneck of KV cache in LLM inference?",
"source_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models",
"source_year": 2024,
"source_id": "d372fb69c485472385f152bc832bf1d35e223324"
},
{
"id": "A002",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What does the research consensus say about KV cache similarity patterns across transformer layers?",
"source_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models",
"source_year": 2024,
"source_id": "d372fb69c485472385f152bc832bf1d35e223324"
},
{
"id": "A003",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What is the established approach for KV cache compression and streaming for long-context LLM serving?",
"source_paper": "CacheGen: KV Cache Compression and Streaming for Fast Large Language Model Serving",
"source_year": 2023,
"source_id": "40e565e070fde823097507fd6830cfa6944df95d"
},
{
"id": "A004",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What is the current consensus on how speculative decoding reduces LLM inference latency?",
"source_paper": "Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding",
"source_year": 2024,
"source_id": "0cee098244c9978032702862a43a09f468f691a4"
},
{
"id": "A005",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What does the research consensus say about the trade-off between draft model quality and speculative decoding speedup?",
"source_paper": "Unlocking Efficiency in Large Language Model Inference: A Comprehensive Survey of Speculative Decoding",
"source_year": 2024,
"source_id": "0cee098244c9978032702862a43a09f468f691a4"
},
{
"id": "A006",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What is the established understanding of self-speculative decoding methods for LLM inference acceleration?",
"source_paper": "SWIFT: On-the-Fly Self-Speculative Decoding for LLM Inference Acceleration",
"source_year": 2024,
"source_id": "bacdf9671fb872287201b53d768df89b4d6630a3"
},
{
"id": "A007",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_merging",
"question": "What is the current consensus on adaptive KV cache merging for long-context LLM tasks?",
"source_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks",
"source_year": 2024,
"source_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2"
},
{
"id": "A008",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What is the established view on whether KV cache compression degrades reasoning performance in LLMs?",
"source_paper": "Hold Onto That Thought: Assessing KV Cache Compression On Reasoning",
"source_year": 2025,
"source_id": "baca578c4a3dcec8a94d6d045970b5f8cb6ebbac"
},
{
"id": "A009",
"category": "A",
"subfield": "llm_efficiency",
"topic": "quantization",
"question": "What does research consensus say about the accuracy-compression tradeoff in LLM quantization?",
"source_paper": "GPTVQ: The Blessing of Dimensionality for LLM Quantization",
"source_year": 2024,
"source_id": "gptvq_paper_id"
},
{
"id": "A010",
"category": "A",
"subfield": "llm_efficiency",
"topic": "quantization",
"question": "What is the current consensus on whether compressed LLMs can perform agentic tasks effectively?",
"source_paper": "Can Compressed LLMs Truly Act? An Empirical Evaluation of Agentic Capabilities",
"source_year": 2025,
"source_id": "compressed_llm_agent_id"
},
{
"id": "A011",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What is the established understanding of token importance criteria for KV cache eviction policies?",
"source_paper": "LagKV: Lag-Relative Information of the KV Cache Tells Which Tokens Are Important",
"source_year": 2025,
"source_id": "50a2c39150d4faca53f4990ddf3d479ff570be23"
},
{
"id": "A012",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What does the literature agree on regarding KV cache compression limitations for autoregressive transformers?",
"source_paper": "Compression Barriers for Autoregressive Transformers",
"source_year": 2025,
"source_id": "compression_barriers_id"
},
{
"id": "A013",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What is the consensus on multi-draft speculative decoding versus single-draft methods?",
"source_paper": "Towards Optimal Multi-draft Speculative Decoding",
"source_year": 2025,
"source_id": "multidraft_spec_id"
},
{
"id": "A014",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What is the established view on using self-verification to improve speculative decoding acceptance rates?",
"source_paper": "Draft Model Knows When to Stop: Self-Verification Speculative Decoding",
"source_year": 2024,
"source_id": "selfverify_spec_id"
},
{
"id": "A015",
"category": "A",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "What is the current consensus on semantic chunking strategies for KV cache-related memory optimization in vision-language models?",
"source_paper": "ZipVL: Efficient Large Vision-Language Models with Dynamic Token Sparsification and KV Cache Compression",
"source_year": 2024,
"source_id": "7bf7b9d24eaef1d30f77cda4f4489e36a8329ee9"
},
{
"id": "A016",
"category": "A",
"subfield": "training_methods",
"topic": "rlhf",
"question": "What is the current consensus on the core mechanism and purpose of RLHF for LLM alignment?",
"source_paper": "A Survey of Reinforcement Learning from Human Feedback",
"source_year": 2023,
"source_id": "rlhf_survey_2023_id"
},
{
"id": "A017",
"category": "A",
"subfield": "training_methods",
"topic": "rlhf",
"question": "What does the research consensus say about the role of reward models in RLHF pipelines?",
"source_paper": "A Survey of Reinforcement Learning from Human Feedback",
"source_year": 2023,
"source_id": "rlhf_survey_2023_id"
},
{
"id": "A018",
"category": "A",
"subfield": "training_methods",
"topic": "rlhf",
"question": "What is the established understanding of length bias as a failure mode in RLHF reward modeling?",
"source_paper": "Bias Fitting to Mitigate Length Bias of Reward Model in RLHF",
"source_year": 2025,
"source_id": "length_bias_rlhf_id"
},
{
"id": "A019",
"category": "A",
"subfield": "training_methods",
"topic": "lora",
"question": "What is the consensus on the fundamental mechanism of LoRA for parameter-efficient fine-tuning?",
"source_paper": "LoRA: Low-Rank Adaptation of Large Language Models",
"source_year": 2021,
"source_id": "lora_original_id"
},
{
"id": "A020",
"category": "A",
"subfield": "training_methods",
"topic": "lora",
"question": "What does the survey literature say about the current state of LoRA variants for large language models?",
"source_paper": "A survey on LoRA of large language models",
"source_year": 2024,
"source_id": "lora_survey_2024_id"
},
{
"id": "A021",
"category": "A",
"subfield": "training_methods",
"topic": "lora",
"question": "What is the established view on the memory efficiency benefits of LoRA compared to full fine-tuning?",
"source_paper": "LoRA: Low-Rank Adaptation of Large Language Models",
"source_year": 2021,
"source_id": "lora_original_id"
},
{
"id": "A022",
"category": "A",
"subfield": "training_methods",
"topic": "dpo",
"question": "What is the current consensus on how DPO differs from PPO in the RLHF alignment pipeline?",
"source_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
"source_year": 2023,
"source_id": "dpo_original_id"
},
{
"id": "A023",
"category": "A",
"subfield": "training_methods",
"topic": "dpo",
"question": "What does research consensus say about the training stability advantages of DPO over PPO?",
"source_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
"source_year": 2023,
"source_id": "dpo_original_id"
},
{
"id": "A024",
"category": "A",
"subfield": "training_methods",
"topic": "qlora",
"question": "What is the established consensus on QLoRA's approach to efficient fine-tuning of quantized LLMs?",
"source_paper": "QLoRA: Efficient Finetuning of Quantized LLMs",
"source_year": 2023,
"source_id": "qlora_original_id"
},
{
"id": "A025",
"category": "A",
"subfield": "training_methods",
"topic": "qlora",
"question": "What does the literature say about QLoRA's ability to match full fine-tuning performance at reduced memory cost?",
"source_paper": "QLoRA: Efficient Finetuning of Quantized LLMs",
"source_year": 2023,
"source_id": "qlora_original_id"
},
{
"id": "A026",
"category": "A",
"subfield": "training_methods",
"topic": "loftq",
"question": "What is the established understanding of LoftQ's approach to aligning quantization with LoRA fine-tuning?",
"source_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models",
"source_year": 2023,
"source_id": "loftq_id"
},
{
"id": "A027",
"category": "A",
"subfield": "training_methods",
"topic": "rlhf",
"question": "What is the current consensus on how direct alignment methods unify preference optimization theory?",
"source_paper": "From RLHF to Direct Alignment: A Theoretical Unification of Preference Learning",
"source_year": 2026,
"source_id": "rlhf_unification_id"
},
{
"id": "A028",
"category": "A",
"subfield": "training_methods",
"topic": "dpo",
"question": "What is the established understanding of gradient imbalance as a limitation in DPO training?",
"source_paper": "Gradient Imbalance in Direct Preference Optimization",
"source_year": 2025,
"source_id": "gradient_imbalance_dpo_id"
},
{
"id": "A029",
"category": "A",
"subfield": "training_methods",
"topic": "lora",
"question": "What does the literature agree on regarding the trade-off between LoRA rank and fine-tuning performance?",
"source_paper": "A survey on LoRA of large language models",
"source_year": 2024,
"source_id": "lora_survey_2024_id"
},
{
"id": "A030",
"category": "A",
"subfield": "training_methods",
"topic": "rlhf",
"question": "What is the consensus on reward hacking as a fundamental challenge in RLHF systems?",
"source_paper": "Adversarial Preference Learning for Robust LLM Alignment",
"source_year": 2025,
"source_id": "adversarial_pref_id"
},
{
"id": "A031",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What is the current consensus on the core benefit of RAG over purely parametric LLMs for knowledge-intensive tasks?",
"source_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
"source_year": 2020,
"source_id": "659bf9ce7175e1ec266ff54359e2bd76e0b7ff31"
},
{
"id": "A032",
"category": "A",
"subfield": "rag",
"topic": "rag_evaluation",
"question": "What does the survey literature say about the current state of RAG evaluation methodologies?",
"source_paper": "Evaluation of Retrieval-Augmented Generation: A Survey",
"source_year": 2024,
"source_id": "rag_eval_survey_id"
},
{
"id": "A033",
"category": "A",
"subfield": "rag",
"topic": "graph_rag",
"question": "What is the established consensus on how graph-based RAG improves over flat vector retrieval?",
"source_paper": "Graph Retrieval-Augmented Generation: A Survey",
"source_year": 2024,
"source_id": "graph_rag_survey_id"
},
{
"id": "A034",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What is the current consensus on the main failure modes of standard RAG pipelines?",
"source_paper": "Retrieval-Augmented Generation for AI-Generated Content: A Survey",
"source_year": 2024,
"source_id": "rag_aigc_survey_id"
},
{
"id": "A035",
"category": "A",
"subfield": "rag",
"topic": "dense_retrieval",
"question": "What does research consensus say about the effectiveness of dense retrieval versus sparse retrieval for RAG systems?",
"source_paper": "Sparse Meets Dense: A Hybrid Approach to Enhance Scientific Document Retrieval",
"source_year": 2024,
"source_id": "sparse_dense_hybrid_id"
},
{
"id": "A036",
"category": "A",
"subfield": "rag",
"topic": "semantic_chunking",
"question": "What is the established understanding of whether semantic chunking outperforms fixed-size chunking in RAG systems?",
"source_paper": "Is Semantic Chunking Worth the Computational Cost?",
"source_year": 2024,
"source_id": "cbf04ffc43b73d315b7ba1c45866bc4eab68ebfc"
},
{
"id": "A037",
"category": "A",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "What is the current consensus on combining dense and sparse retrieval methods in RAG pipelines?",
"source_paper": "DAT: Dynamic Alpha Tuning for Hybrid Retrieval in Retrieval-Augmented Generation",
"source_year": 2025,
"source_id": "4e57d2fa070c8c9e2f36341c5f86e35b6ca33f2c"
},
{
"id": "A038",
"category": "A",
"subfield": "rag",
"topic": "rag_long_context",
"question": "What is the established consensus on when RAG outperforms long-context LLMs for knowledge retrieval?",
"source_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs",
"source_year": 2025,
"source_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521"
},
{
"id": "A039",
"category": "A",
"subfield": "rag",
"topic": "graph_rag",
"question": "What does the literature agree on regarding graph RAG for customized domain-specific LLMs?",
"source_paper": "A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models",
"source_year": 2025,
"source_id": "908d45b0d2b88ba72ee501c368eb618d29d61ce0"
},
{
"id": "A040",
"category": "A",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "What is the consensus on prompt-based methods for generating both dense and sparse representations for retrieval?",
"source_paper": "PromptReps: Prompting Large Language Models to Generate Dense and Sparse Representations for Zero-Shot Document Retrieval",
"source_year": 2024,
"source_id": "ee8918225cc3c558b07cada34ac366a9dc081bdd"
},
{
"id": "A041",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What does the survey literature agree on regarding RAG for natural language processing tasks?",
"source_paper": "Retrieval-Augmented Generation for Natural Language Processing: A Survey",
"source_year": 2024,
"source_id": "rag_nlp_survey_id"
},
{
"id": "A042",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What is the current consensus on RAG and its extensions for AI-generated content quality?",
"source_paper": "Retrieval Augmented Generation (RAG) and Beyond: A Comprehensive Survey",
"source_year": 2024,
"source_id": "rag_beyond_survey_id"
},
{
"id": "A043",
"category": "A",
"subfield": "rag",
"topic": "chunking",
"question": "What does the literature agree on regarding the impact of chunking strategies on domain-specific RAG performance?",
"source_paper": "The Impact of Chunking Strategies on Domain-Specific Information Retrieval",
"source_year": 2025,
"source_id": "chunking_domain_id"
},
{
"id": "A044",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What is the established consensus on RAG for automating systematic literature reviews?",
"source_paper": "Automating Systematic Literature Reviews with Retrieval-Augmented Generation",
"source_year": 2024,
"source_id": "rag_lit_review_id"
},
{
"id": "A045",
"category": "A",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "What is the established view on whether small embedding models with LLM re-ranking can beat large embedding models in hybrid retrieval?",
"source_paper": "Rethinking Hybrid Retrieval: When Small Embeddings and LLM Re-ranking Beat Bigger Models",
"source_year": 2025,
"source_id": "11bb1e4137b1c1daf11464bd7b4750cb0b5db8d8"
},
{
"id": "A046",
"category": "A",
"subfield": "training_methods",
"topic": "dpo",
"question": "What is the consensus on token-level reward guidance as an improvement to standard DPO?",
"source_paper": "TGDPO: Harnessing Token-Level Reward Guidance for Enhancing Direct Preference Optimization",
"source_year": 2025,
"source_id": "tgdpo_id"
},
{
"id": "A047",
"category": "A",
"subfield": "training_methods",
"topic": "lora",
"question": "What does the literature agree on about applying LoRA to code-generation LLMs?",
"source_paper": "Aligning CodeLLMs with Direct Preference Optimization",
"source_year": 2024,
"source_id": "code_dpo_id"
},
{
"id": "A048",
"category": "A",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "What is the established view on lossless speculative decoding for diffusion language models?",
"source_paper": "Spiffy: Multiplying Diffusion LLM Acceleration via Lossless Speculative Decoding",
"source_year": 2025,
"source_id": "2a9c37efd3b943e58f0cf56ee91c9ff7894546cb"
},
{
"id": "A049",
"category": "A",
"subfield": "rag",
"topic": "rag_overview",
"question": "What is the current consensus on the limitations of RAG for non-factoid question answering?",
"source_paper": "Typed-RAG: Type-Aware Decomposition of Non-Factoid Questions for Retrieval-Augmented Generation",
"source_year": 2025,
"source_id": "eb4df9446d932b422c68633836611e63be06d0e1"
},
{
"id": "A050",
"category": "A",
"subfield": "rag",
"topic": "graph_rag",
"question": "What does the literature agree on regarding graph-based RAG for large language model customization in specialized domains?",
"source_paper": "Graph Retrieval-Augmented Generation for Large Language Models: A Survey",
"source_year": 2024,
"source_id": "graph_rag2_survey_id"
},
{
"id": "B001",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "kv_cache_eviction",
"question": "What were the dominant KV cache eviction strategies before adaptive merging methods, and what superseded them?",
"older_paper": "Heavy-Hitter Oracle (H2O): Efficient Generative Inference of Large Language Models",
"older_year": 2023,
"newer_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks",
"newer_year": 2024,
"newer_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2"
},
{
"id": "B002",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did depth-dimension KV cache compression challenge the established layer-independent compression approach?",
"older_paper": "Layer-independent KV cache compression (StreamingLLM-style eviction)",
"older_year": 2023,
"newer_paper": "MiniCache: KV Cache Compression in Depth Dimension for Large Language Models",
"newer_year": 2024,
"newer_id": "d372fb69c485472385f152bc832bf1d35e223324"
},
{
"id": "B003",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did self-speculative decoding methods supersede the requirement for a separate draft model?",
"older_paper": "Speculative Decoding (original draft-model approach)",
"older_year": 2022,
"newer_paper": "SWIFT: On-the-Fly Self-Speculative Decoding for LLM Inference Acceleration",
"newer_year": 2024,
"newer_id": "bacdf9671fb872287201b53d768df89b4d6630a3"
},
{
"id": "B004",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did CTC-based draft models demonstrate limitations in standard autoregressive draft models for speculative decoding?",
"older_paper": "Standard autoregressive draft model speculative decoding",
"older_year": 2022,
"newer_paper": "Speculative Decoding with CTC-based Draft Model for LLM Inference Acceleration",
"newer_year": 2024,
"newer_id": "3230ed476488a459d27efc22e8cc5eb4d0298c4f"
},
{
"id": "B005",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did multi-draft speculative decoding improve over the original single-draft paradigm?",
"older_paper": "Original speculative decoding with single draft",
"older_year": 2022,
"newer_paper": "Towards Optimal Multi-draft Speculative Decoding",
"newer_year": 2025,
"newer_id": "multidraft_spec_id"
},
{
"id": "B006",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "rlhf_vs_dpo",
"question": "How did DPO show that PPO-based RLHF is not necessary for preference alignment in LLMs?",
"older_paper": "Proximal Policy Optimization (PPO) for RLHF alignment",
"older_year": 2022,
"newer_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
"newer_year": 2023,
"newer_id": "dpo_original_id"
},
{
"id": "B007",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "qlora_vs_lora",
"question": "How did QLoRA demonstrate that 4-bit quantized fine-tuning could match full 16-bit LoRA performance?",
"older_paper": "LoRA: Low-Rank Adaptation of Large Language Models",
"older_year": 2021,
"newer_paper": "QLoRA: Efficient Finetuning of Quantized LLMs",
"newer_year": 2023,
"newer_id": "qlora_original_id"
},
{
"id": "B008",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "loftq_vs_qlora",
"question": "How did LoftQ identify and address the initialization gap that limits QLoRA fine-tuning quality?",
"older_paper": "QLoRA: Efficient Finetuning of Quantized LLMs",
"older_year": 2023,
"newer_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models",
"newer_year": 2023,
"newer_id": "loftq_id"
},
{
"id": "B009",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "dpo_variants",
"question": "How did hybrid rejection sampling DPO address the distribution shift problem in standard DPO?",
"older_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
"older_year": 2023,
"newer_paper": "RS-DPO: A Hybrid Rejection Sampling and Direct Preference Optimization",
"newer_year": 2024,
"newer_id": "rsdpo_id"
},
{
"id": "B010",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "rlhf_reward_hacking",
"question": "How did adversarial preference learning expose vulnerabilities in standard RLHF reward models?",
"older_paper": "Standard RLHF with fixed reward model",
"older_year": 2022,
"newer_paper": "Adversarial Preference Learning for Robust LLM Alignment",
"newer_year": 2025,
"newer_id": "adversarial_pref_id"
},
{
"id": "B011",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "chunking_strategies",
"question": "How did semantic chunking challenge the assumption that fixed-size chunking is sufficient for RAG retrieval quality?",
"older_paper": "Fixed-size chunking for RAG (early RAG implementations)",
"older_year": 2021,
"newer_paper": "Is Semantic Chunking Worth the Computational Cost?",
"newer_year": 2024,
"newer_id": "cbf04ffc43b73d315b7ba1c45866bc4eab68ebfc"
},
{
"id": "B012",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "How did hybrid dense-sparse retrieval demonstrate limitations in relying solely on dense vector retrieval?",
"older_paper": "Dense Passage Retrieval (DPR)",
"older_year": 2020,
"newer_paper": "Sparse Meets Dense: A Hybrid Approach to Enhance Scientific Document Retrieval",
"newer_year": 2024,
"newer_id": "sparse_dense_hybrid_id"
},
{
"id": "B013",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "rag_vs_long_context",
"question": "How did long-context LLMs challenge the assumption that RAG is always the best approach for external knowledge integration?",
"older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
"older_year": 2020,
"newer_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs",
"newer_year": 2025,
"newer_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521"
},
{
"id": "B014",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "graph_rag",
"question": "How did graph-based RAG supersede flat vector retrieval for multi-hop reasoning tasks?",
"older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
"older_year": 2020,
"newer_paper": "Graph Retrieval-Augmented Generation: A Survey",
"newer_year": 2024,
"newer_id": "graph_rag_survey_id"
},
{
"id": "B015",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "dense_retrieval",
"question": "How did prompt-based unified dense-sparse representations challenge separate dense and sparse retrieval systems?",
"older_paper": "Separate dense (DPR) and sparse (BM25) retrieval systems",
"older_year": 2020,
"newer_paper": "PromptReps: Prompting Large Language Models to Generate Dense and Sparse Representations for Zero-Shot Document Retrieval",
"newer_year": 2024,
"newer_id": "ee8918225cc3c558b07cada34ac366a9dc081bdd"
},
{
"id": "B016",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "How did dynamic alpha tuning improve over static weight assignment in hybrid dense-sparse retrieval for RAG?",
"older_paper": "Static weight hybrid retrieval (fixed BM25 + dense weights)",
"older_year": 2022,
"newer_paper": "DAT: Dynamic Alpha Tuning for Hybrid Retrieval in Retrieval-Augmented Generation",
"newer_year": 2025,
"newer_id": "4e57d2fa070c8c9e2f36341c5f86e35b6ca33f2c"
},
{
"id": "B017",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did token importance diversity criteria supersede pure importance-only KV cache eviction policies?",
"older_paper": "H2O: Heavy-Hitter Oracle for KV Cache Eviction",
"older_year": 2023,
"newer_paper": "Mixing Importance with Diversity: Joint Optimization for KV Cache Compression in Large Vision-Language Models",
"newer_year": 2025,
"newer_id": "d31ca000a53fda36cdba549691c08052783173d8"
},
{
"id": "B018",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "rlhf_alignment",
"question": "How did direct alignment methods theoretically subsume PPO-based RLHF as a special case?",
"older_paper": "PPO-based RLHF for language model alignment",
"older_year": 2022,
"newer_paper": "From RLHF to Direct Alignment: A Theoretical Unification of Preference Learning",
"newer_year": 2026,
"newer_id": "rlhf_unification_id"
},
{
"id": "B019",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "rag_evaluation",
"question": "How did structured RAG evaluation frameworks expose the inadequacy of single-metric retrieval benchmarks?",
"older_paper": "Early RAG evaluation using only BLEU/ROUGE",
"older_year": 2021,
"newer_paper": "Evaluation of Retrieval-Augmented Generation: A Survey",
"newer_year": 2024,
"newer_id": "rag_eval_survey_id"
},
{
"id": "B020",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did autoencoder-based KV cache compression supersede simple quantization-only KV compression approaches?",
"older_paper": "KV cache quantization (INT8/INT4 KV cache)",
"older_year": 2023,
"newer_paper": "KV-CAR: KV Cache Compression using Autoencoders and KV Reuse in Large Language Models",
"newer_year": 2025,
"newer_id": "78a7d0994ab6fb4ccafdfb87d9e8fc2f171558ef"
},
{
"id": "B021",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "dpo",
"question": "How did analysis of DPO gradient imbalance reveal a previously unrecognized training instability in standard DPO?",
"older_paper": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
"older_year": 2023,
"newer_paper": "Gradient Imbalance in Direct Preference Optimization",
"newer_year": 2025,
"newer_id": "gradient_imbalance_dpo_id"
},
{
"id": "B022",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did batch speculative decoding with optimal token selection supersede greedy draft token selection?",
"older_paper": "Greedy draft token selection in speculative decoding",
"older_year": 2023,
"newer_paper": "TETRIS: Optimal Draft Token Selection for Batch Speculative Decoding",
"newer_year": 2025,
"newer_id": "tetris_spec_id"
},
{
"id": "B023",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "chunking_strategies",
"question": "How did multi-overlap chunking demonstrate that non-overlapping semantic chunks miss critical context boundaries?",
"older_paper": "Single-boundary semantic chunking for RAG",
"older_year": 2023,
"newer_paper": "Mix-Of-Overlap: Enhancing Retrieval-Augmented Generation with Multiple Overlapping Chunks",
"newer_year": 2025,
"newer_id": "mix_overlap_id"
},
{
"id": "B024",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "lora",
"question": "How did layer-wise adaptive LoRA improve over uniform rank assignment across all transformer layers?",
"older_paper": "LoRA: Low-Rank Adaptation of Large Language Models",
"older_year": 2021,
"newer_paper": "La-LoRA: Parameter-efficient fine-tuning with layer-wise adaptive low-rank adaptation",
"newer_year": 2025,
"newer_id": "lalora_id"
},
{
"id": "B025",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "retrieval_decoupling",
"question": "How did cross-attention decoupling of knowledge and context expose inefficiencies in standard RAG architectures?",
"older_paper": "Standard concatenation-based RAG architectures",
"older_year": 2022,
"newer_paper": "Decoupling Knowledge and Context: An Efficient and Effective Retrieval Augmented Generation Framework via Cross Attention",
"newer_year": 2025,
"newer_id": "d9305383cf1cbea1239f0301c06b314fee43cf4d"
},
{
"id": "B026",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did adaptive KV cache merging improve over fixed-budget token eviction for long-context tasks?",
"older_paper": "Fixed-budget KV eviction methods (H2O, SnapKV)",
"older_year": 2023,
"newer_paper": "Model Tells You Where to Merge: Adaptive KV Cache Merging for LLMs on Long-Context Tasks",
"newer_year": 2024,
"newer_id": "54f4ce7ff3390d9b8ffff90ff9be4f6e14046cd2"
},
{
"id": "B027",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "lora",
"question": "How did QR decomposition-based LoRA initialization expose the sensitivity of standard random initialization in LoRA?",
"older_paper": "LoRA: Low-Rank Adaptation of Large Language Models",
"older_year": 2021,
"newer_paper": "QR-LoRA: QR-Based Low-Rank Adaptation for Efficient Fine-Tuning",
"newer_year": 2025,
"newer_id": "qrlora_id"
},
{
"id": "B028",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "graph_rag",
"question": "How did graph-structured retrieval address the flat retrieval assumption that all documents are equally connected?",
"older_paper": "DPR and flat vector retrieval for RAG",
"older_year": 2020,
"newer_paper": "Graph Retrieval-Augmented Generation for Large Language Models: A Survey",
"newer_year": 2024,
"newer_id": "graph_rag2_survey_id"
},
{
"id": "B029",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did draft model alignment with chain-of-thought distillation supersede vanilla draft model training?",
"older_paper": "Standard small draft model training for speculative decoding",
"older_year": 2023,
"newer_paper": "Direct Alignment of Draft Model for Speculative Decoding with Chain-of-Thought Distillation",
"newer_year": 2024,
"newer_id": "direct_align_draft_id"
},
{
"id": "B030",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "rlhf",
"question": "How did discriminative reward modeling address the attention hacking failure mode in standard RLHF reward models?",
"older_paper": "Standard RLHF reward model training",
"older_year": 2022,
"newer_paper": "Alleviating Attention Hacking in Discriminative Reward Modeling",
"newer_year": 2025,
"newer_id": "attention_hacking_id"
},
{
"id": "B031",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "rag_evaluation",
"question": "How did RAG and long-context benchmarking reveal that neither approach dominates across all task types?",
"older_paper": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
"older_year": 2020,
"newer_paper": "LaRA: Benchmarking Retrieval-Augmented Generation and Long-Context LLMs - No Silver Bullet",
"newer_year": 2025,
"newer_id": "b8034f821c2a870d87d20f9f9227e3ffd8f81521"
},
{
"id": "B032",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did theoretical analysis of compression barriers expose practical limits of KV cache reduction in autoregressive models?",
"older_paper": "Empirical KV cache compression methods (H2O, MiniCache, SnapKV)",
"older_year": 2023,
"newer_paper": "Compression Barriers for Autoregressive Transformers",
"newer_year": 2025,
"newer_id": "compression_barriers_id"
},
{
"id": "B033",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "lora_vs_full_ft",
"question": "How did empirical comparison show LoRA underperforms full fine-tuning for handwritten text recognition despite memory savings?",
"older_paper": "LoRA as universal fine-tuning replacement",
"older_year": 2021,
"newer_paper": "Low-Rank Adaptation vs. Fine-Tuning for Handwritten Text Recognition",
"newer_year": 2025,
"newer_id": "lora_vs_ft_htr_id"
},
{
"id": "B034",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "How did scalable sparse retrieval in decoder-only LLMs challenge the assumption that sparse retrieval requires encoder models?",
"older_paper": "BM25 and sparse retrieval with encoder-based models",
"older_year": 2021,
"newer_paper": "Scaling Sparse and Dense Retrieval in Decoder-Only LLMs",
"newer_year": 2025,
"newer_id": "scaling_sparse_dense_id"
},
{
"id": "B035",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did lag-relative token importance scoring supersede attention-score-only KV eviction criteria?",
"older_paper": "Attention-score based KV eviction (H2O, SnapKV)",
"older_year": 2023,
"newer_paper": "LagKV: Lag-Relative Information of the KV Cache Tells Which Tokens Are Important",
"newer_year": 2025,
"newer_id": "50a2c39150d4faca53f4990ddf3d479ff570be23"
},
{
"id": "B036",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "dpo",
"question": "How did DPO for diffusion models extend and challenge the assumption that preference optimization is only applicable to autoregressive LLMs?",
"older_paper": "RLHF and DPO for autoregressive language models only",
"older_year": 2023,
"newer_paper": "Diffusion Model Alignment Using Direct Preference Optimization",
"newer_year": 2023,
"newer_id": "diffusion_dpo_id"
},
{
"id": "B037",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "chunking_strategies",
"question": "How did context reconstruction strategies for RAG expose the limitations of forward-only chunking?",
"older_paper": "Standard forward-only chunking for RAG",
"older_year": 2022,
"newer_paper": "Reconstructing Context: Evaluating Advanced Chunking Strategies for RAG",
"newer_year": 2025,
"newer_id": "reconstructing_context_id"
},
{
"id": "B038",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did KV cache streaming expose the inadequacy of in-memory-only KV cache management for long-context serving?",
"older_paper": "In-memory KV cache management for LLM serving",
"older_year": 2022,
"newer_paper": "CacheGen: KV Cache Compression and Streaming for Fast Large Language Model Serving",
"newer_year": 2023,
"newer_id": "40e565e070fde823097507fd6830cfa6944df95d"
},
{
"id": "B039",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "lora",
"question": "How did AutoML-based parameter-efficient fine-tuning selection challenge manually chosen LoRA configurations?",
"older_paper": "Manually configured LoRA hyperparameters",
"older_year": 2021,
"newer_paper": "AutoAdapt: On the Application of AutoML for Parameter-Efficient Fine-Tuning",
"newer_year": 2025,
"newer_id": "autoadapt_id"
},
{
"id": "B040",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "rag_overview",
"question": "How did graph RAG for customized LLMs demonstrate that flat document retrieval is insufficient for domain-specific expert knowledge?",
"older_paper": "Standard dense vector RAG for domain adaptation",
"older_year": 2021,
"newer_paper": "A Survey of Graph Retrieval-Augmented Generation for Customized Large Language Models",
"newer_year": 2025,
"newer_id": "908d45b0d2b88ba72ee501c368eb618d29d61ce0"
},
{
"id": "B041",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did INT8 GPU-accelerated KV cache quantization supersede FP16 KV cache storage as the deployment baseline?",
"older_paper": "FP16 KV cache storage baseline for LLM inference",
"older_year": 2022,
"newer_paper": "GPU-Accelerated INT8 Quantization for KV Cache Compression in Large Language Models",
"newer_year": 2026,
"newer_id": "gpu_int8_kv_id"
},
{
"id": "B042",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "How did domain-specific hybrid search demonstrate BM25-alone insufficiency for specialized question answering?",
"older_paper": "BM25-only retrieval for domain QA",
"older_year": 2020,
"newer_paper": "Domain-specific Question Answering with Hybrid Search",
"newer_year": 2024,
"newer_id": "da0b29c1f5d6d7fdb575023d62416e3751314a98"
},
{
"id": "B043",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "rlhf",
"question": "How did length-bias analysis reveal that high RLHF reward scores may reflect verbosity rather than quality?",
"older_paper": "Standard RLHF reward model without length normalization",
"older_year": 2022,
"newer_paper": "Bias Fitting to Mitigate Length Bias of Reward Model in RLHF",
"newer_year": 2025,
"newer_id": "length_bias_rlhf_id"
},
{
"id": "B044",
"category": "B",
"supersession_type": "soft",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "How did speculative decoding with self-verification supersede always-accept draft token strategies?",
"older_paper": "Always-accept draft token speculative decoding approaches",
"older_year": 2023,
"newer_paper": "Draft Model Knows When to Stop: Self-Verification Speculative Decoding",
"newer_year": 2024,
"newer_id": "selfverify_spec_id"
},
{
"id": "B045",
"category": "B",
"supersession_type": "hard",
"subfield": "rag",
"topic": "rag_overview",
"question": "How did decoupled cross-attention RAG demonstrate that standard prepend-and-attend context integration wastes computation?",
"older_paper": "Standard prepend-context RAG architecture",
"older_year": 2020,
"newer_paper": "Decoupling Knowledge and Context: An Efficient and Effective Retrieval Augmented Generation Framework",
"newer_year": 2025,
"newer_id": "d9305383cf1cbea1239f0301c06b314fee43cf4d"
},
{
"id": "B046",
"category": "B",
"supersession_type": "soft",
"subfield": "training_methods",
"topic": "lora",
"question": "How did SPM-LoRA's task-adaptive fine-tuning framework challenge the assumption that a single LoRA configuration generalizes across tasks?",
"older_paper": "Single-configuration LoRA for all tasks",
"older_year": 2021,
"newer_paper": "SPM-LoRA: A Novel Framework for Task-Adaptive Fine-Tuning Using Sparse Plus Mixture LoRA",
"newer_year": 2025,
"newer_id": "spm_lora_id"
},
{
"id": "B047",
"category": "B",
"supersession_type": "hard",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "How did dynamic retrieval KV cache approaches challenge static KV cache management policies?",
"older_paper": "Static KV cache management policies (fixed eviction budget)",
"older_year": 2023,
"newer_paper": "HeteroCache: A Dynamic Retrieval Approach to Heterogeneous KV Cache Management",
"newer_year": 2026,
"newer_id": "heterocache_id"
},
{
"id": "B048",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "chunking_strategies",
"question": "How did the analysis of chunking techniques across diverse document types challenge single-strategy chunking assumptions?",
"older_paper": "Single-strategy chunking (fixed-size or semantic-only)",
"older_year": 2022,
"newer_paper": "Comparison of Chunking Techniques Across Diverse Document Types in RAG",
"newer_year": 2025,
"newer_id": "chunking_comparison_id"
},
{
"id": "B049",
"category": "B",
"supersession_type": "hard",
"subfield": "training_methods",
"topic": "qlora",
"question": "How did gradient-aware LoftQ improvement expose the suboptimality of standard alternating least squares quantization in fine-tuning?",
"older_paper": "LoftQ: LoRA-Fine-Tuning-Aware Quantization for Large Language Models",
"older_year": 2023,
"newer_paper": "GA-LoftQ: Gradient-Aware Alternating Least Squares Framework for Quantized LoRA Fine-Tuning",
"newer_year": 2025,
"newer_id": "ga_loftq_id"
},
{
"id": "B050",
"category": "B",
"supersession_type": "soft",
"subfield": "rag",
"topic": "graph_rag",
"question": "How did heterogeneous document RAG expose the limitations of text-only retrieval pipelines for tables and figures?",
"older_paper": "Text-only retrieval for RAG systems",
"older_year": 2020,
"newer_paper": "TableRAG: A Retrieval Augmented Generation Framework for Heterogeneous Document Reasoning",
"newer_year": 2025,
"newer_id": "8b0606d354d1452c9893b08f991a2da0f8ea4580"
},
{
"id": "C001",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Is there consensus on whether KV cache compression meaningfully degrades reasoning ability in LLMs?",
"camps": "Camp A: KV cache compression preserves reasoning performance within acceptable bounds for most benchmarks (MiniCache, SnapKV papers). Camp B: KV cache compression causes disproportionate reasoning degradation that standard benchmarks fail to capture (Hold Onto That Thought, 2025).",
"label": "CONTESTED"
},
{
"id": "C002",
"category": "C",
"subfield": "llm_efficiency",
"topic": "quantization",
"question": "Do experts agree on whether quantized LLMs are suitable for agentic task execution?",
"camps": "Camp A: Quantization preserves sufficient capability for agentic tasks at 4-bit precision. Camp B: Compressed LLMs exhibit significant capability degradation on multi-step agentic workflows that single-task benchmarks do not reveal (Can Compressed LLMs Truly Act?, 2025).",
"label": "CONTESTED"
},
{
"id": "C003",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Is there agreement on the theoretical limits of KV cache compression for autoregressive transformers?",
"camps": "Camp A: KV cache compression ratios are primarily empirically bounded by task accuracy constraints, with no hard theoretical floor. Camp B: There exist information-theoretic compression barriers for autoregressive transformers that bound achievable compression independently of method choice (Compression Barriers for Autoregressive Transformers, 2025).",
"label": "CONTESTED"
},
{
"id": "C004",
"category": "C",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "Do researchers agree on whether speculative decoding is lossless in practice for all LLM architectures?",
"camps": "Camp A: Speculative decoding with proper acceptance criteria guarantees lossless output distribution matching the target model. Camp B: Practical implementation of lossless speculative decoding for diffusion LLMs and non-standard architectures requires additional mechanisms not present in original speculative decoding (Spiffy, 2025).",
"label": "CONTESTED"
},
{
"id": "C005",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Is there consensus on whether attention-score-based token importance is the right criterion for KV cache eviction?",
"camps": "Camp A: Attention scores are a reliable and sufficient proxy for token importance in KV eviction decisions. Camp B: Lag-relative and recency-aware token importance signals outperform pure attention scores for long-context KV eviction (LagKV, 2025).",
"label": "CONTESTED"
},
{
"id": "C006",
"category": "C",
"subfield": "training_methods",
"topic": "rlhf_vs_dpo",
"question": "Do researchers agree on whether DPO is superior to PPO for LLM alignment across all task types?",
"camps": "Camp A: DPO provides more stable and simpler training than PPO and achieves comparable or better alignment quality without a separate reward model. Camp B: PPO retains advantages over DPO on complex reasoning tasks and instruction following where on-policy exploration matters (multiple RLHF survey papers, 2024-2025).",
"label": "CONTESTED"
},
{
"id": "C007",
"category": "C",
"subfield": "training_methods",
"topic": "reward_hacking",
"question": "Is there agreement on whether current RLHF reward models are robust to adversarial preference manipulation?",
"camps": "Camp A: RLHF reward models with sufficient scale and diverse human feedback are robust to most adversarial inputs. Camp B: Standard RLHF reward models are systematically vulnerable to adversarial preference attacks that exploit reward model blind spots (Adversarial Preference Learning, 2025).",
"label": "CONTESTED"
},
{
"id": "C008",
"category": "C",
"subfield": "training_methods",
"topic": "length_bias",
"question": "Do researchers agree on the severity of length bias as a distortion in RLHF reward model training?",
"camps": "Camp A: Length bias is a known but manageable artifact in RLHF reward models that can be controlled through dataset curation. Camp B: Length bias is a fundamental and systematic distortion that causes reward models to conflate verbosity with quality, requiring explicit architectural mitigation (Bias Fitting, 2025).",
"label": "CONTESTED"
},
{
"id": "C009",
"category": "C",
"subfield": "training_methods",
"topic": "lora_vs_full_ft",
"question": "Is there consensus on whether LoRA consistently matches full fine-tuning performance across diverse tasks?",
"camps": "Camp A: LoRA achieves near-full-fine-tuning quality across most NLP tasks with dramatically fewer trainable parameters, making it the default choice. Camp B: LoRA underperforms full fine-tuning on specialized tasks such as handwritten text recognition and structured prediction where task-specific weight updates matter (Low-Rank Adaptation vs. Fine-Tuning for HTR, 2025).",
"label": "CONTESTED"
},
{
"id": "C010",
"category": "C",
"subfield": "training_methods",
"topic": "dpo_stability",
"question": "Do researchers agree on whether DPO training is stable and well-calibrated across different model scales?",
"camps": "Camp A: DPO is inherently more stable than PPO because it eliminates the separate reward model and online sampling, reducing variance. Camp B: DPO exhibits gradient imbalance and distribution shift problems that destabilize training at scale, requiring hybrid methods like RS-DPO (Gradient Imbalance in DPO, 2025; RS-DPO, 2024).",
"label": "CONTESTED"
},
{
"id": "C011",
"category": "C",
"subfield": "training_methods",
"topic": "qlora_quality",
"question": "Is there agreement on whether QLoRA's 4-bit quantization introduces acceptable quality degradation for production use?",
"camps": "Camp A: QLoRA achieves full fine-tuning quality within noise thresholds on standard benchmarks, making it production-viable. Camp B: QLoRA's quantization-induced initialization gap causes measurable downstream quality degradation that requires additional correction (LoftQ, 2023; GA-LoftQ, 2025).",
"label": "CONTESTED"
},
{
"id": "C012",
"category": "C",
"subfield": "rag",
"topic": "rag_vs_long_context",
"question": "Do researchers agree on whether RAG or long-context LLMs are the better approach for external knowledge integration?",
"camps": "Camp A: RAG is more cost-efficient and controllable for knowledge-intensive tasks, and outperforms long-context LLMs on multi-document retrieval. Camp B: Long-context LLMs with sufficient context windows can match or surpass RAG on many tasks, and there is no silver bullet — task routing is required (LaRA, 2025).",
"label": "CONTESTED"
},
{
"id": "C013",
"category": "C",
"subfield": "rag",
"topic": "semantic_chunking",
"question": "Is there consensus on whether semantic chunking provides meaningful retrieval quality improvement over fixed-size chunking?",
"camps": "Camp A: Semantic chunking consistently improves retrieval precision by preserving coherent units of meaning. Camp B: Semantic chunking's improvements are marginal and task-dependent, often not justifying the added computational overhead (Is Semantic Chunking Worth the Computational Cost?, 2024).",
"label": "CONTESTED"
},
{
"id": "C014",
"category": "C",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "Do researchers agree on the optimal balance between dense and sparse signals in hybrid retrieval for RAG?",
"camps": "Camp A: A fixed interpolation weight between BM25 and dense retrieval scores generalizes well across domains with minimal tuning. Camp B: Dynamic per-query alpha tuning is necessary for robust hybrid retrieval performance across heterogeneous document types (DAT, 2025).",
"label": "CONTESTED"
},
{
"id": "C015",
"category": "C",
"subfield": "rag",
"topic": "graph_rag",
"question": "Is there agreement on whether graph-structured retrieval is necessary for multi-hop reasoning in RAG systems?",
"camps": "Camp A: Sufficiently large dense retrieval models with re-ranking can handle multi-hop reasoning without explicit graph structure. Camp B: Graph-based RAG is necessary for multi-hop reasoning over relational knowledge because flat retrieval cannot capture cross-document entity relationships (Graph RAG Survey, 2024).",
"label": "CONTESTED"
},
{
"id": "C016",
"category": "C",
"subfield": "rag",
"topic": "retrieval_decoupling",
"question": "Do researchers agree on whether cross-attention decoupling of context and knowledge improves RAG efficiency and quality?",
"camps": "Camp A: Standard concatenation of retrieved documents with the query is sufficient and simpler to implement. Camp B: Cross-attention decoupling of knowledge and context reduces redundant computation and improves factual grounding (Decoupling Knowledge and Context, 2025).",
"label": "CONTESTED"
},
{
"id": "C017",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Is there consensus on whether KV cache merging is preferable to KV cache eviction for long-context LLM tasks?",
"camps": "Camp A: Token eviction methods are simpler and sufficient, as dropped tokens contribute marginally to final outputs. Camp B: KV cache merging preserves information that eviction permanently destroys, leading to measurably better long-context task accuracy (KVMerger: Model Tells You Where to Merge, 2024).",
"label": "CONTESTED"
},
{
"id": "C018",
"category": "C",
"subfield": "training_methods",
"topic": "rlhf",
"question": "Do researchers agree on whether RLHF reward models generalize across cultural and linguistic contexts?",
"camps": "Camp A: RLHF models trained on sufficiently diverse human feedback generalize reasonably across cultural contexts. Camp B: RLHF reward models trained predominantly on English and Western feedback exhibit systematic cultural bias that limits cross-cultural alignment (RLHF Cultural Survey, 2025).",
"label": "CONTESTED"
},
{
"id": "C019",
"category": "C",
"subfield": "rag",
"topic": "rag_evaluation",
"question": "Is there agreement on what metrics constitute a complete evaluation of RAG system quality?",
"camps": "Camp A: Faithfulness and answer relevance (as measured by Ragas) are sufficient for a complete RAG quality evaluation. Camp B: Complete RAG evaluation requires additional dimensions including retrieval precision, context utilization, and temporal relevance that Ragas does not capture (Evaluation of RAG Survey, 2024).",
"label": "CONTESTED"
},
{
"id": "C020",
"category": "C",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "Do researchers agree on whether speculative decoding provides consistent speedup across all LLM serving scenarios?",
"camps": "Camp A: Speculative decoding provides reliable 2-3x speedup across batch sizes and input types when draft and target models are well-matched. Camp B: Speculative decoding speedup degrades significantly under high-batch serving scenarios and with diverse prompts, where draft model acceptance rates drop substantially (survey papers on speculative decoding, 2024).",
"label": "CONTESTED"
},
{
"id": "C021",
"category": "C",
"subfield": "training_methods",
"topic": "lora",
"question": "Is there consensus on the optimal rank configuration for LoRA across different model sizes and task types?",
"camps": "Camp A: Low ranks (r=4 to r=16) generalize well across most tasks and model sizes, making manual rank selection straightforward. Camp B: Optimal LoRA rank is highly task- and layer-dependent, requiring adaptive rank assignment methods for peak performance (La-LoRA, AutoAdapt, 2025).",
"label": "CONTESTED"
},
{
"id": "C022",
"category": "C",
"subfield": "rag",
"topic": "chunking_strategies",
"question": "Do researchers agree on whether there is a universally optimal chunking strategy for RAG across document types?",
"camps": "Camp A: Semantic chunking with moderate overlap is the best general-purpose strategy for most document types in RAG. Camp B: No single chunking strategy dominates across diverse document types; optimal chunking is document-structure-dependent and requires task-specific configuration (Comparison of Chunking Techniques, 2025).",
"label": "CONTESTED"
},
{
"id": "C023",
"category": "C",
"subfield": "training_methods",
"topic": "rlhf_vs_dpo",
"question": "Is there agreement on whether preference alignment methods like DPO can fully replace RLHF for safety-critical alignment?",
"camps": "Camp A: DPO and its variants provide sufficient alignment guarantees for safety-critical applications while being more tractable than RLHF. Camp B: Online RL methods like PPO-based RLHF remain necessary for robust safety alignment because DPO's offline nature cannot adapt to distribution shift during deployment (alignment theory papers, 2025-2026).",
"label": "CONTESTED"
},
{
"id": "C024",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Do researchers agree on whether KV cache compression techniques transfer effectively from standard LLMs to vision-language models?",
"camps": "Camp A: KV cache compression methods developed for text-only LLMs transfer directly to vision-language models with minimal adaptation. Camp B: Vision-language models require joint token importance and diversity criteria for KV compression because visual tokens have fundamentally different importance distributions than text tokens (ZipVL, Mixing Importance with Diversity, 2024-2025).",
"label": "CONTESTED"
},
{
"id": "C025",
"category": "C",
"subfield": "rag",
"topic": "hybrid_retrieval",
"question": "Is there consensus on whether hybrid retrieval consistently outperforms pure dense retrieval in production RAG systems?",
"camps": "Camp A: Hybrid retrieval (BM25 + dense) consistently outperforms pure dense retrieval across domains due to complementary strengths of lexical and semantic matching. Camp B: The advantage of hybrid retrieval over pure dense retrieval is highly dataset-dependent, and for in-domain tasks with strong embeddings, pure dense often matches hybrid (Rethinking Hybrid Retrieval, 2025; PIRB, 2024).",
"label": "CONTESTED"
},
{
"id": "C026",
"category": "C",
"subfield": "training_methods",
"topic": "qlora",
"question": "Do researchers agree on whether QLoRA's memory savings justify its use over standard LoRA for all model sizes?",
"camps": "Camp A: QLoRA's 4-bit quantization enables fine-tuning of models that would otherwise not fit in GPU memory, with negligible quality cost. Camp B: For models that fit in memory with standard LoRA, QLoRA introduces quantization noise that outweighs its memory benefits, and LoftQ-style corrections are needed (QLoRA, 2023; LoftQ, 2023; GA-LoftQ, 2025).",
"label": "CONTESTED"
},
{
"id": "C027",
"category": "C",
"subfield": "llm_efficiency",
"topic": "speculative_decoding",
"question": "Is there agreement on whether draft model alignment improves speculative decoding acceptance rates significantly?",
"camps": "Camp A: Standard small language models used as draft models achieve sufficient token acceptance rates without specialized alignment training. Camp B: Explicit draft model alignment with chain-of-thought distillation or direct alignment methods significantly improves acceptance rates and end-to-end throughput (Direct Alignment of Draft Model, 2024; AdaEAGLE, 2024).",
"label": "CONTESTED"
},
{
"id": "C028",
"category": "C",
"subfield": "rag",
"topic": "graph_rag",
"question": "Do researchers agree on the computational overhead justification for graph-based RAG versus flat retrieval RAG?",
"camps": "Camp A: Graph RAG's higher computational cost is justified by substantially better multi-hop reasoning quality in complex domain tasks. Camp B: The computational overhead of graph construction and traversal in graph RAG often exceeds its quality benefits for standard question-answering workloads (Graph RAG surveys, 2024).",
"label": "CONTESTED"
},
{
"id": "C029",
"category": "C",
"subfield": "training_methods",
"topic": "rlhf",
"question": "Is there consensus on whether RLHF scales effectively to multimodal alignment tasks?",
"camps": "Camp A: RLHF principles transfer directly to multimodal alignment with appropriate reward signal design for image-text tasks. Camp B: Multimodal RLHF requires substantially different reward modeling and feedback collection than text-only RLHF, and current methods have not solved the cross-modal reward specification problem (Preference Alignment on Diffusion Models Survey, 2025).",
"label": "CONTESTED"
},
{
"id": "C030",
"category": "C",
"subfield": "llm_efficiency",
"topic": "kv_cache_compression",
"question": "Do researchers agree on whether fundamentally rethinking KV cache compression improves over targeted eviction for long-generation tasks?",
"camps": "Camp A: Targeted eviction with careful importance scoring is sufficient and simpler than wholesale KV cache architecture rethinking. Camp B: Fundamental rethinking of KV cache compression is required because targeted eviction methods hit an inherent ceiling on long-generation tasks (Rethinking Key-Value Cache Compression Techniques, 2025).",
"label": "CONTESTED"
}
]