# File: scholar-env/scripts/generate_corpus.py
# (provenance note — repo upload "Replace with ScholarEnv v0.4.0 - complete rewrite",
#  commit 8dde6c4, uploaded by flyingmaverick)
#!/usr/bin/env python3
"""
scripts/generate_corpus.py β€” Generate synthetic annotated papers for ScholarEnv.
Produces 3 papers covering:
paper_001 β€” NLP benchmark paper (easy inconsistencies, clear table refs)
paper_002 β€” Computer vision survey (medium, more tables, injected discrepancies)
paper_003 β€” Multi-task learning paper (hard, nested claims, subtle mismatches)
Each paper is a realistic synthetic document with:
- Well-structured sections (abstract, intro, methods, results, discussion, refs)
- Tables with numerical data
- Ground truth annotations for Tasks 1, 2, and 3
- Injected discrepancies (text says X, table says Y)
Run:
python scripts/generate_corpus.py
Outputs:
data/papers/paper_001.json
data/papers/paper_002.json
data/papers/paper_003.json
"""
from __future__ import annotations
import json
from pathlib import Path
# Output directory for the generated paper JSON files, relative to the CWD.
OUT_DIR = Path("data/papers")
# Created eagerly at import time so any later write cannot fail on a missing dir.
OUT_DIR.mkdir(parents=True, exist_ok=True)
# ── Paper 001: NLP Benchmark β€” LanguageNet ────────────────────────────────────
# Synthetic paper #1 (easy): one clear abstract-vs-table score mismatch plus two
# simple formatting violations. Schema (shared by all three papers):
#   sections      — section name -> full text (plain strings)
#   tables        — table id -> {"caption": str, "data": {row: value}}
#   figures       — figure id -> {"caption": str, "type": str}
#   ground_truth  — per-task annotations; "injected": True marks discrepancies
#                   deliberately planted by this generator, False marks entries
#                   that arise from the text itself (including control entries
#                   that agents should NOT report).
PAPER_001 = {
    "id": "paper_001",
    "title": "LanguageNet: A Multi-Task Pre-Training Framework for Natural Language Understanding",
    "source": "arxiv:synthetic_001",
    "license": "CC-BY 4.0",
    # Relative difficulty on [0, 1]; paper_001 is the easiest of the corpus.
    "difficulty_score": 0.35,
    "sections": {
        "abstract": (
            "We present LanguageNet, a multi-task pre-training framework for natural language "
            "understanding (NLU). Our model achieves state-of-the-art results on the GLUE "
            "benchmark, reaching an overall score of 94.3, outperforming prior methods by "
            "2.1 points. The model was trained on 128 billion tokens using a mixture of "
            "masked language modelling, next sentence prediction, and span boundary objectives. "
            "We evaluate across eight downstream tasks and report consistent improvements. "
            "LanguageNet demonstrates that joint training across heterogeneous NLU tasks "
            "provides complementary supervision signals that improve generalisation. "
            "We release model weights and training code to facilitate reproducibility."
        ),
        "introduction": (
            "Natural language understanding (NLU) encompasses a wide range of tasks including "
            "sentiment analysis, textual entailment, and question answering. The GLUE benchmark "
            "(Wang et al., 2018) provides a standardised evaluation suite across eight tasks. "
            "Recent models such as BERT [1], RoBERTa [2], and DeBERTa [3] have pushed performance "
            "significantly. In this work, we propose LanguageNet, which extends the pre-training "
            "paradigm with three complementary objectives. Our main contributions are as follows: "
            "(1) a novel multi-task pre-training objective combining three learning signals, "
            "(2) a curriculum scheduling strategy that adapts task weights during training, "
            "and (3) comprehensive ablation studies demonstrating the contribution of each component. "
            "We achieve a GLUE score of 94.3, establishing a new state of the art."
        ),
        "methods": (
            "LanguageNet is built on a 340M parameter transformer architecture. We use a "
            "vocabulary of 50,265 byte-pair encoding (BPE) tokens. Pre-training uses three "
            "objectives simultaneously: (1) masked language modelling (MLM) with a masking "
            "probability of 15%, (2) next sentence prediction (NSP), and (3) span boundary "
            "objective (SBO) [4]. Training runs for 1 million steps on 128 billion tokens "
            "drawn from a mixture of BookCorpus, English Wikipedia, CC-News, and OpenWebText. "
            "We use the AdamW optimiser [5] with a peak learning rate of 1e-4, linear warmup "
            "over 10,000 steps, and polynomial decay. Batch size is 8,192 sequences of 512 tokens. "
            "Fine-tuning follows the standard protocol: we add a task-specific classification head "
            "and train for 3 epochs with a learning rate of 2e-5."
        ),
        "results": (
            "Table 1 reports GLUE benchmark results. LanguageNet achieves an average score of "
            "91.7 across all eight tasks, with particular strength on MNLI (90.2) and QQP (92.5). "
            "On the SST-2 sentiment task, our model reaches 97.1 accuracy. "
            "Table 2 presents ablation results showing the contribution of each pre-training "
            "objective. Removing SBO reduces GLUE score by 1.8 points, while removing NSP "
            "reduces it by 0.9 points, confirming the value of our multi-task design. "
            "Training time was 14 days on 64 NVIDIA A100 GPUs. "
            "Our model uses 340 million parameters, comparable to BERT-Large."
        ),
        "discussion": (
            "The results confirm that multi-task pre-training provides complementary supervision. "
            "The SBO objective appears most valuable for tasks requiring span-level reasoning "
            "such as SQuAD. The curriculum scheduler reduces training instability during "
            "the early stages where task gradients conflict. One limitation of our approach "
            "is the increased computational cost compared to single-objective pre-training. "
            "Future work will explore parameter-efficient adaptation and distillation "
            "to smaller model sizes."
        ),
        "references": (
            "[1] Devlin et al. (2018). BERT: Pre-training of Deep Bidirectional Transformers. NAACL.\n"
            "[2] Liu et al. (2019). RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv.\n"
            "[3] He et al. (2020). DeBERTa: Decoding-enhanced BERT with Disentangled Attention. ICLR.\n"
            "[4] Joshi et al. (2020). SpanBERT. TACL.\n"
            "[5] Loshchilov & Hutter (2019). Decoupled Weight Decay Regularization. ICLR."
        ),
    },
    # Numerical data referenced by the text; all cell values are strings.
    "tables": {
        "Table 1": {
            "caption": "Table 1: GLUE benchmark results. Best results per task in bold.",
            "data": {
                "MNLI": "90.2",
                "QQP": "92.5",
                "QNLI": "95.3",
                "SST-2": "97.1",
                "CoLA": "67.4",
                "STS-B": "91.8",
                "MRPC": "89.6",
                "RTE": "87.0",
                "Average": "91.7",
            },
        },
        "Table 2": {
            "caption": "Table 2: Ablation study. Each row removes one pre-training objective.",
            "data": {
                "Full model": "91.7",
                "w/o SBO": "89.9",
                "w/o NSP": "90.8",
                "w/o MLM": "73.2",
            },
        },
    },
    "figures": {
        "Figure 1": {
            "caption": "Figure 1: Training loss curves for LanguageNet.",
            "type": "line_chart",
        },
    },
    # Ground-truth annotations for Tasks 1–3 (see module docstring).
    "ground_truth": {
        "task1_violations": [
            {"rule": "citation_format_ieee",
             "note": "Uses (Author, Year) style; IEEE requires [N]"},
            {"rule": "abstract_max_words",
             "actual": 105, "limit": 100,
             "note": "Abstract slightly over IEEE 100-word limit"},
        ],
        "task2_inconsistencies": [
            {
                "id": "IC_001",
                "type": "number_mismatch",
                "location_a": "abstract",
                "claim_a": "reaching an overall score of 94.3",
                "location_b": "results",
                "claim_b": "LanguageNet achieves an average score of 91.7",
                "injected": False,
                "note": "Abstract claims 94.3 but results section and Table 1 show 91.7",
            },
            {
                # Control entry: claims agree, so this must NOT be reported.
                "id": "IC_002",
                "type": "contribution_count",
                "location_a": "introduction",
                "claim_a": "three complementary objectives",
                "location_b": "methods",
                "claim_b": "three objectives simultaneously",
                "injected": False,
                "note": "Consistent count — this is NOT an inconsistency (control entry)",
            },
        ],
        "task3_discrepancies": [
            {
                "id": "D_001",
                "type": "table_text_mismatch",
                "text_location": "abstract",
                "text_claim": "reaching an overall score of 94.3",
                "table_id": "Table 1",
                "table_value": "91.7",
                "injected": True,
                "note": "Abstract inflated by 2.6 points vs Table 1 Average",
            },
        ],
    },
}
# ── Paper 002: Computer Vision Survey ────────────────────────────────────────
# Synthetic paper #2 (medium): more tables, nested table rows (row -> column dict),
# a four-vs-three contribution mismatch, and an injected abstract mIoU inflation.
# Same schema as PAPER_001; "injected": True marks generator-planted discrepancies,
# and control entries (injected False, noted as consistent) must NOT be reported.
PAPER_002 = {
    "id": "paper_002",
    "title": "DenseVision: Efficient Dense Prediction with Hierarchical Feature Aggregation",
    "source": "arxiv:synthetic_002",
    "license": "CC-BY 4.0",
    "difficulty_score": 0.60,
    "sections": {
        "abstract": (
            "We introduce DenseVision, an efficient architecture for dense prediction tasks "
            "including semantic segmentation and depth estimation. DenseVision employs "
            "hierarchical feature aggregation across four resolution scales, achieving "
            "mIoU of 56.2 on the ADE20K dataset while running at 47 frames per second "
            "on a single NVIDIA RTX 3090. Compared to prior efficient methods, our model "
            "reduces memory consumption by 38% while maintaining competitive accuracy. "
            "We evaluate on three benchmarks: ADE20K, Cityscapes, and NYU-Depth-v2."
        ),
        "introduction": (
            "Dense prediction tasks require per-pixel understanding of scene content. "
            "Semantic segmentation assigns a class label to every pixel [1], while monocular "
            "depth estimation predicts a continuous depth map [2]. State-of-the-art methods "
            "such as SegFormer [3] and BEiT [4] achieve high accuracy but are computationally "
            "expensive. We propose DenseVision, designed for real-time inference. "
            "Our contributions: "
            "(1) a hierarchical feature pyramid that aggregates multi-scale context efficiently, "
            "(2) a lightweight attention mechanism that reduces quadratic complexity to linear, "
            "(3) a joint training protocol for segmentation and depth simultaneously, and "
            "(4) comprehensive benchmarks across three standard datasets. "
            "The model runs at 47 fps, meeting real-time constraints for autonomous driving."
        ),
        "methods": (
            "DenseVision consists of a MobileNetV3 backbone followed by four aggregation "
            "stages at stride 4, 8, 16, and 32. Each stage produces a feature map that is "
            "upsampled and summed with the preceding level. The lightweight attention module "
            "decomposes the attention matrix into two low-rank factors, achieving O(n) "
            "complexity. The joint loss combines cross-entropy for segmentation and "
            "scale-invariant log-RMSE for depth, weighted 0.7:0.3. We train for 160,000 "
            "iterations with batch size 16 on 4 A100 GPUs. Learning rate follows a poly "
            "schedule from 6e-5 to 0. Data augmentation includes random horizontal flip, "
            "random crop to 512×512, and colour jitter."
        ),
        "results": (
            "Table 1 reports semantic segmentation results on ADE20K. DenseVision achieves "
            "mIoU of 54.8 — competitive with SegFormer-B2 (51.8) while running 3.2× faster. "
            "Table 2 reports depth estimation results on NYU-Depth-v2. Our model achieves "
            "delta1 accuracy of 0.921 and RMSE of 0.341. "
            "Table 3 shows inference speed vs. accuracy on Cityscapes. DenseVision "
            "achieves 78.3 mIoU at 47 fps. Compared to prior efficient models, "
            "memory usage is reduced by 38%. The parameter count is 31.2 million, "
            "substantially smaller than SegFormer-B5 (82M)."
        ),
        "discussion": (
            "DenseVision demonstrates that hierarchical aggregation without heavy attention "
            "is sufficient for competitive dense prediction. The 0.7:0.3 joint loss weighting "
            "was found empirically — adjusting to 0.5:0.5 degraded segmentation by 1.1 mIoU "
            "while improving depth RMSE by 0.012. The linear attention approximation "
            "introduces a small accuracy gap (0.3 mIoU) but enables real-time inference. "
            "Limitations: the joint training may not generalise to all dense prediction tasks."
        ),
        "references": (
            "[1] Long et al. (2015). Fully Convolutional Networks. CVPR.\n"
            "[2] Eigen et al. (2014). Depth Map Prediction. NeurIPS.\n"
            "[3] Xie et al. (2021). SegFormer. NeurIPS.\n"
            "[4] Bao et al. (2021). BEiT. ICLR."
        ),
    },
    # Unlike paper_001, rows here map to per-column dicts (harder to parse).
    "tables": {
        "Table 1": {
            "caption": "Table 1: Semantic segmentation on ADE20K validation set.",
            "data": {
                "DenseVision": {"mIoU": "54.8", "params": "31.2M", "fps": "47"},
                "SegFormer-B2": {"mIoU": "51.8", "params": "25M", "fps": "15"},
                "SegFormer-B5": {"mIoU": "56.1", "params": "82M", "fps": "4"},
            },
        },
        "Table 2": {
            "caption": "Table 2: Depth estimation on NYU-Depth-v2.",
            "data": {
                "DenseVision": {"delta1": "0.921", "RMSE": "0.341"},
                "BEiT-Large": {"delta1": "0.956", "RMSE": "0.270"},
            },
        },
        "Table 3": {
            "caption": "Table 3: Speed vs. accuracy on Cityscapes.",
            "data": {
                "DenseVision": {"mIoU": "78.3", "fps": "47", "memory_MB": "3240"},
                "DeepLabV3+": {"mIoU": "80.1", "fps": "12", "memory_MB": "5200"},
            },
        },
    },
    "figures": {
        "Figure 1": {"caption": "Figure 1: DenseVision architecture.", "type": "architecture"},
        "Figure 2": {"caption": "Figure 2: Qualitative segmentation results.", "type": "samples"},
    },
    "ground_truth": {
        "task1_violations": [
            {"rule": "keywords_section_present", "note": "No Keywords section"},
            {"rule": "author_block_present", "note": "No author affiliation block"},
        ],
        "task2_inconsistencies": [
            {
                "id": "IC_001",
                "type": "number_mismatch",
                "location_a": "abstract",
                "claim_a": "mIoU of 56.2 on the ADE20K dataset",
                "location_b": "results",
                "claim_b": "DenseVision achieves mIoU of 54.8",
                "injected": False,
                "note": "Abstract inflated by 1.4 mIoU",
            },
            {
                "id": "IC_002",
                "type": "contribution_count",
                "location_a": "introduction",
                "claim_a": "four contributions listed",
                "location_b": "methods",
                "claim_b": "three methodological elements described",
                "injected": False,
                "note": "Intro promises 4 contributions, methods only implements 3",
            },
        ],
        "task3_discrepancies": [
            {
                "id": "D_001",
                "type": "table_text_mismatch",
                "text_location": "abstract",
                "text_claim": "mIoU of 56.2 on the ADE20K dataset",
                "table_id": "Table 1",
                "table_value": "54.8",
                "injected": True,
            },
            {
                # Control entry: text and table agree; must NOT be reported.
                "id": "D_002",
                "type": "table_text_mismatch",
                "text_location": "results",
                "text_claim": "DenseVision achieves mIoU of 54.8 — competitive with SegFormer-B2 (51.8)",
                "table_id": "Table 1",
                "table_value": "SegFormer-B2 mIoU=51.8",
                "injected": False,
                "note": "This one IS consistent — control entry",
            },
        ],
    },
}
# ── Paper 003: Multi-Task Learning ────────────────────────────────────────────
# Synthetic paper #3 (hard): nested claims across abstract/results/tables with a
# subtle 4.7% vs 3.9% inflation, plus a consistent control discrepancy entry.
# Same schema as PAPER_001; "injected": True marks generator-planted discrepancies.
PAPER_003 = {
    "id": "paper_003",
    "title": "UnifiedLM: Scaling Multi-Task Language Models with Adaptive Gradient Balancing",
    "source": "arxiv:synthetic_003",
    "license": "CC-BY 4.0",
    "difficulty_score": 0.80,
    "sections": {
        "abstract": (
            "We present UnifiedLM, a large-scale multi-task language model trained on "
            "23 diverse NLP tasks simultaneously. UnifiedLM-3B achieves an average improvement "
            "of 4.7% over single-task baselines across all evaluated tasks. On SuperGLUE, "
            "UnifiedLM scores 91.2, surpassing human performance (89.8) by 1.4 points. "
            "Our adaptive gradient balancing (AGB) algorithm dynamically reweights task "
            "gradients to prevent dominated tasks from collapsing. We train models at "
            "three scales: 350M, 1B, and 3B parameters."
        ),
        "introduction": (
            "Multi-task learning (MTL) in NLP seeks to share representations across tasks, "
            "improving data efficiency and generalisation. Classic challenges include negative "
            "transfer [1] and gradient conflict between tasks [2]. We introduce UnifiedLM "
            "and the AGB algorithm. Contributions: "
            "(1) AGB — an adaptive gradient balancing algorithm that provably reduces "
            "gradient conflict in MTL settings, "
            "(2) a unified training protocol across 23 NLP tasks without task-specific "
            "hyperparameter tuning, "
            "(3) state-of-the-art SuperGLUE results at the 3B parameter scale. "
            "UnifiedLM-3B achieves 91.2 on SuperGLUE."
        ),
        "methods": (
            "UnifiedLM is based on a T5 [3] encoder-decoder backbone. We train on 23 tasks "
            "from the FLAN collection [4] and additional tasks from PromptSource [5]. "
            "The AGB algorithm computes per-task gradient norms at each step and reweights "
            "gradients to equalise their magnitudes. Specifically, for K tasks, task k "
            "receives weight w_k = median_norm / ||g_k||. This prevents any single task "
            "from dominating the shared parameter updates. Models are trained at three "
            "scales: 350M, 1B, and 3B parameters. Training uses Adafactor with a "
            "learning rate of 5e-4 for 500,000 steps."
        ),
        "results": (
            "Table 1 shows SuperGLUE results. UnifiedLM-3B achieves 91.2. "
            "Table 2 compares average improvement over single-task baselines across "
            "all 23 tasks. UnifiedLM-3B improves by 3.9% on average — a significant "
            "and consistent gain across task families. "
            "Table 3 ablates the AGB algorithm. Removing AGB reduces SuperGLUE score "
            "by 2.3 points (from 91.2 to 88.9). Replacing AGB with PCGrad [2] gives "
            "90.1, confirming AGB's superiority. "
            "At the 1B scale, UnifiedLM scores 88.4 on SuperGLUE. "
            "Figure 1 shows gradient conflict reduction over training steps."
        ),
        "discussion": (
            "AGB's provable gradient conflict reduction translates to consistent accuracy "
            "gains. The 3B model is our strongest, but the 1B model offers a better "
            "efficiency-accuracy trade-off. One limitation is the computational overhead "
            "of computing per-task gradient norms at each step, adding approximately 8% "
            "to training time. Future work will explore second-order AGB variants "
            "and extension to vision-language tasks."
        ),
        "references": (
            "[1] Crawshaw (2020). Multi-Task Learning with Deep Neural Networks. arXiv.\n"
            "[2] Yu et al. (2020). Gradient Surgery for Multi-Task Learning. NeurIPS.\n"
            "[3] Raffel et al. (2020). Exploring the Limits of Transfer Learning with T5. JMLR.\n"
            "[4] Wei et al. (2022). Finetuned Language Models Are Zero-Shot Learners. ICLR.\n"
            "[5] Bach et al. (2022). PromptSource. ACL."
        ),
    },
    "tables": {
        "Table 1": {
            "caption": "Table 1: SuperGLUE benchmark results.",
            "data": {
                "UnifiedLM-350M": "85.3",
                "UnifiedLM-1B": "88.4",
                "UnifiedLM-3B": "91.2",
                "Human baseline": "89.8",
                "T5-11B": "90.3",
            },
        },
        "Table 2": {
            "caption": "Table 2: Average improvement over single-task baselines (23 tasks).",
            "data": {
                "UnifiedLM-350M": "+2.1%",
                "UnifiedLM-1B": "+3.2%",
                "UnifiedLM-3B": "+3.9%",
            },
        },
        "Table 3": {
            "caption": "Table 3: Ablation of gradient balancing algorithm.",
            "data": {
                "UnifiedLM-3B (full)": "91.2",
                "w/o AGB": "88.9",
                "w/ PCGrad": "90.1",
            },
        },
    },
    "figures": {
        "Figure 1": {
            "caption": "Figure 1: Gradient conflict (cosine similarity) over training.",
            "type": "line_chart",
        },
    },
    "ground_truth": {
        "task1_violations": [
            {"rule": "abstract_max_words", "actual": 118, "limit": 100},
            {"rule": "citation_format_ieee", "note": "Uses [N] inline but references not IEEE formatted"},
        ],
        "task2_inconsistencies": [
            {
                "id": "IC_001",
                "type": "number_mismatch",
                "location_a": "abstract",
                "claim_a": "average improvement of 4.7% over single-task baselines",
                "location_b": "results",
                "claim_b": "UnifiedLM-3B improves by 3.9% on average",
                "injected": False,
                "note": "Abstract says 4.7% but results and Table 2 say 3.9%",
            },
            {
                "id": "IC_002",
                "type": "contribution_count",
                "location_a": "introduction",
                "claim_a": "three contributions listed in introduction",
                "location_b": "abstract",
                "claim_b": "abstract does not enumerate contributions",
                "injected": False,
                "note": "Intro lists 3 contributions; verify methods covers all 3",
            },
        ],
        "task3_discrepancies": [
            {
                "id": "D_001",
                "type": "table_text_mismatch",
                "text_location": "abstract",
                "text_claim": "average improvement of 4.7% over single-task baselines",
                "table_id": "Table 2",
                "table_value": "+3.9%",
                "injected": True,
                "note": "Abstract says 4.7%, Table 2 shows 3.9% for UnifiedLM-3B",
            },
            {
                # Control entry: text and table agree; must NOT be reported.
                "id": "D_002",
                "type": "table_text_mismatch",
                "text_location": "results",
                "text_claim": "UnifiedLM-3B improves by 3.9% on average",
                "table_id": "Table 2",
                "table_value": "+3.9%",
                "injected": False,
                "note": "CONSISTENT — control; should NOT be reported",
            },
        ],
    },
}
# ── Write papers ──────────────────────────────────────────────────────────────
def main(force: bool = False) -> None:
    """Write the three synthetic papers to ``OUT_DIR`` as pretty-printed JSON.

    Existing output files are left untouched unless *force* is true (or the
    ``--force`` flag appears on the command line), so hand-edited ground-truth
    annotations are never clobbered by an accidental re-run.

    Args:
        force: Regenerate the corpus even when all output files already exist.
    """
    import sys

    force = force or "--force" in sys.argv
    papers = [PAPER_001, PAPER_002, PAPER_003]
    # Defensive: OUT_DIR is created at import time, but re-create it here in
    # case the directory was removed between import and invocation.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    # Skip if all JSON files already exist — avoids overwriting hand-annotated GT.
    if not force and all((OUT_DIR / f"{p['id']}.json").exists() for p in papers):
        print(f" Corpus already present in {OUT_DIR.resolve()} — skipping.")
        print(" Pass --force to regenerate from scratch.")
        return
    for paper in papers:
        out_path = OUT_DIR / f"{paper['id']}.json"
        # ensure_ascii=False keeps non-ASCII text readable in the output files.
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(paper, f, indent=2, ensure_ascii=False)
        print(f" Written: {out_path}")
    print(f"\n✓ {len(papers)} papers written to {OUT_DIR.resolve()}")


if __name__ == "__main__":
    main()