#!/usr/bin/env python3 """ scripts/generate_corpus.py — Generate synthetic annotated papers for ScholarEnv. Produces 3 papers covering: paper_001 — NLP benchmark paper (easy inconsistencies, clear table refs) paper_002 — Computer vision survey (medium, more tables, injected discrepancies) paper_003 — Multi-task learning paper (hard, nested claims, subtle mismatches) Each paper is a realistic synthetic document with: - Well-structured sections (abstract, intro, methods, results, discussion, refs) - Tables with numerical data - Ground truth annotations for Tasks 1, 2, and 3 - Injected discrepancies (text says X, table says Y) Run: python scripts/generate_corpus.py Outputs: data/papers/paper_001.json data/papers/paper_002.json data/papers/paper_003.json """ from __future__ import annotations import json from pathlib import Path OUT_DIR = Path("data/papers") OUT_DIR.mkdir(parents=True, exist_ok=True) # ── Paper 001: NLP Benchmark — LanguageNet ──────────────────────────────────── PAPER_001 = { "id": "paper_001", "title": "LanguageNet: A Multi-Task Pre-Training Framework for Natural Language Understanding", "source": "arxiv:synthetic_001", "license": "CC-BY 4.0", "difficulty_score": 0.35, "sections": { "abstract": ( "We present LanguageNet, a multi-task pre-training framework for natural language " "understanding (NLU). Our model achieves state-of-the-art results on the GLUE " "benchmark, reaching an overall score of 94.3, outperforming prior methods by " "2.1 points. The model was trained on 128 billion tokens using a mixture of " "masked language modelling, next sentence prediction, and span boundary objectives. " "We evaluate across eight downstream tasks and report consistent improvements. " "LanguageNet demonstrates that joint training across heterogeneous NLU tasks " "provides complementary supervision signals that improve generalisation. " "We release model weights and training code to facilitate reproducibility." ), "introduction": ( "Natural language understanding (NLU) encompasses a wide range of tasks including " "sentiment analysis, textual entailment, and question answering. The GLUE benchmark " "(Wang et al., 2018) provides a standardised evaluation suite across eight tasks. " "Recent models such as BERT [1], RoBERTa [2], and DeBERTa [3] have pushed performance " "significantly. In this work, we propose LanguageNet, which extends the pre-training " "paradigm with three complementary objectives. Our main contributions are as follows: " "(1) a novel multi-task pre-training objective combining three learning signals, " "(2) a curriculum scheduling strategy that adapts task weights during training, " "and (3) comprehensive ablation studies demonstrating the contribution of each component. " "We achieve a GLUE score of 94.3, establishing a new state of the art." ), "methods": ( "LanguageNet is built on a 340M parameter transformer architecture. We use a " "vocabulary of 50,265 byte-pair encoding (BPE) tokens. Pre-training uses three " "objectives simultaneously: (1) masked language modelling (MLM) with a masking " "probability of 15%, (2) next sentence prediction (NSP), and (3) span boundary " "objective (SBO) [4]. Training runs for 1 million steps on 128 billion tokens " "drawn from a mixture of BookCorpus, English Wikipedia, CC-News, and OpenWebText. " "We use the AdamW optimiser [5] with a peak learning rate of 1e-4, linear warmup " "over 10,000 steps, and polynomial decay. Batch size is 8,192 sequences of 512 tokens. " "Fine-tuning follows the standard protocol: we add a task-specific classification head " "and train for 3 epochs with a learning rate of 2e-5." ), "results": ( "Table 1 reports GLUE benchmark results. LanguageNet achieves an average score of " "91.7 across all eight tasks, with particular strength on MNLI (90.2) and QQP (92.5). " "On the SST-2 sentiment task, our model reaches 97.1 accuracy. " "Table 2 presents ablation results showing the contribution of each pre-training " "objective. Removing SBO reduces GLUE score by 1.8 points, while removing NSP " "reduces it by 0.9 points, confirming the value of our multi-task design. " "Training time was 14 days on 64 NVIDIA A100 GPUs. " "Our model uses 340 million parameters, comparable to BERT-Large." ), "discussion": ( "The results confirm that multi-task pre-training provides complementary supervision. " "The SBO objective appears most valuable for tasks requiring span-level reasoning " "such as SQuAD. The curriculum scheduler reduces training instability during " "the early stages where task gradients conflict. One limitation of our approach " "is the increased computational cost compared to single-objective pre-training. " "Future work will explore parameter-efficient adaptation and distillation " "to smaller model sizes." ), "references": ( "[1] Devlin et al. (2018). BERT: Pre-training of Deep Bidirectional Transformers. NAACL.\n" "[2] Liu et al. (2019). RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv.\n" "[3] He et al. (2020). DeBERTa: Decoding-enhanced BERT with Disentangled Attention. ICLR.\n" "[4] Joshi et al. (2020). SpanBERT. TACL.\n" "[5] Loshchilov & Hutter (2019). Decoupled Weight Decay Regularization. ICLR." ), }, "tables": { "Table 1": { "caption": "Table 1: GLUE benchmark results. Best results per task in bold.", "data": { "MNLI": "90.2", "QQP": "92.5", "QNLI": "95.3", "SST-2": "97.1", "CoLA": "67.4", "STS-B": "91.8", "MRPC": "89.6", "RTE": "87.0", "Average": "91.7", }, }, "Table 2": { "caption": "Table 2: Ablation study. Each row removes one pre-training objective.", "data": { "Full model": "91.7", "w/o SBO": "89.9", "w/o NSP": "90.8", "w/o MLM": "73.2", }, }, }, "figures": { "Figure 1": { "caption": "Figure 1: Training loss curves for LanguageNet.", "type": "line_chart", }, }, "ground_truth": { "task1_violations": [ {"rule": "citation_format_ieee", "note": "Uses (Author, Year) style; IEEE requires [N]"}, {"rule": "abstract_max_words", "actual": 105, "limit": 100, "note": "Abstract slightly over IEEE 100-word limit"}, ], "task2_inconsistencies": [ { "id": "IC_001", "type": "number_mismatch", "location_a": "abstract", "claim_a": "reaching an overall score of 94.3", "location_b": "results", "claim_b": "LanguageNet achieves an average score of 91.7", "injected": False, "note": "Abstract claims 94.3 but results section and Table 1 show 91.7", }, { "id": "IC_002", "type": "contribution_count", "location_a": "introduction", "claim_a": "three complementary objectives", "location_b": "methods", "claim_b": "three objectives simultaneously", "injected": False, "note": "Consistent count — this is NOT an inconsistency (control entry)", }, ], "task3_discrepancies": [ { "id": "D_001", "type": "table_text_mismatch", "text_location": "abstract", "text_claim": "reaching an overall score of 94.3", "table_id": "Table 1", "table_value": "91.7", "injected": True, "note": "Abstract inflated by 2.6 points vs Table 1 Average", }, ], }, } # ── Paper 002: Computer Vision Survey ──────────────────────────────────────── PAPER_002 = { "id": "paper_002", "title": "DenseVision: Efficient Dense Prediction with Hierarchical Feature Aggregation", "source": "arxiv:synthetic_002", "license": "CC-BY 4.0", "difficulty_score": 0.60, "sections": { "abstract": ( "We introduce DenseVision, an efficient architecture for dense prediction tasks " "including semantic segmentation and depth estimation. DenseVision employs " "hierarchical feature aggregation across four resolution scales, achieving " "mIoU of 56.2 on the ADE20K dataset while running at 47 frames per second " "on a single NVIDIA RTX 3090. Compared to prior efficient methods, our model " "reduces memory consumption by 38% while maintaining competitive accuracy. " "We evaluate on three benchmarks: ADE20K, Cityscapes, and NYU-Depth-v2." ), "introduction": ( "Dense prediction tasks require per-pixel understanding of scene content. " "Semantic segmentation assigns a class label to every pixel [1], while monocular " "depth estimation predicts a continuous depth map [2]. State-of-the-art methods " "such as SegFormer [3] and BEiT [4] achieve high accuracy but are computationally " "expensive. We propose DenseVision, designed for real-time inference. " "Our contributions: " "(1) a hierarchical feature pyramid that aggregates multi-scale context efficiently, " "(2) a lightweight attention mechanism that reduces quadratic complexity to linear, " "(3) a joint training protocol for segmentation and depth simultaneously, and " "(4) comprehensive benchmarks across three standard datasets. " "The model runs at 47 fps, meeting real-time constraints for autonomous driving." ), "methods": ( "DenseVision consists of a MobileNetV3 backbone followed by four aggregation " "stages at stride 4, 8, 16, and 32. Each stage produces a feature map that is " "upsampled and summed with the preceding level. The lightweight attention module " "decomposes the attention matrix into two low-rank factors, achieving O(n) " "complexity. The joint loss combines cross-entropy for segmentation and " "scale-invariant log-RMSE for depth, weighted 0.7:0.3. We train for 160,000 " "iterations with batch size 16 on 4 A100 GPUs. Learning rate follows a poly " "schedule from 6e-5 to 0. Data augmentation includes random horizontal flip, " "random crop to 512×512, and colour jitter." ), "results": ( "Table 1 reports semantic segmentation results on ADE20K. DenseVision achieves " "mIoU of 54.8 — competitive with SegFormer-B2 (51.8) while running 3.2× faster. " "Table 2 reports depth estimation results on NYU-Depth-v2. Our model achieves " "delta1 accuracy of 0.921 and RMSE of 0.341. " "Table 3 shows inference speed vs. accuracy on Cityscapes. DenseVision " "achieves 78.3 mIoU at 47 fps. Compared to prior efficient models, " "memory usage is reduced by 38%. The parameter count is 31.2 million, " "substantially smaller than SegFormer-B5 (82M)." ), "discussion": ( "DenseVision demonstrates that hierarchical aggregation without heavy attention " "is sufficient for competitive dense prediction. The 0.7:0.3 joint loss weighting " "was found empirically — adjusting to 0.5:0.5 degraded segmentation by 1.1 mIoU " "while improving depth RMSE by 0.012. The linear attention approximation " "introduces a small accuracy gap (0.3 mIoU) but enables real-time inference. " "Limitations: the joint training may not generalise to all dense prediction tasks." ), "references": ( "[1] Long et al. (2015). Fully Convolutional Networks. CVPR.\n" "[2] Eigen et al. (2014). Depth Map Prediction. NeurIPS.\n" "[3] Xie et al. (2021). SegFormer. NeurIPS.\n" "[4] Bao et al. (2021). BEiT. ICLR." ), }, "tables": { "Table 1": { "caption": "Table 1: Semantic segmentation on ADE20K validation set.", "data": { "DenseVision": {"mIoU": "54.8", "params": "31.2M", "fps": "47"}, "SegFormer-B2": {"mIoU": "51.8", "params": "25M", "fps": "15"}, "SegFormer-B5": {"mIoU": "56.1", "params": "82M", "fps": "4"}, }, }, "Table 2": { "caption": "Table 2: Depth estimation on NYU-Depth-v2.", "data": { "DenseVision": {"delta1": "0.921", "RMSE": "0.341"}, "BEiT-Large": {"delta1": "0.956", "RMSE": "0.270"}, }, }, "Table 3": { "caption": "Table 3: Speed vs. accuracy on Cityscapes.", "data": { "DenseVision": {"mIoU": "78.3", "fps": "47", "memory_MB": "3240"}, "DeepLabV3+": {"mIoU": "80.1", "fps": "12", "memory_MB": "5200"}, }, }, }, "figures": { "Figure 1": {"caption": "Figure 1: DenseVision architecture.", "type": "architecture"}, "Figure 2": {"caption": "Figure 2: Qualitative segmentation results.", "type": "samples"}, }, "ground_truth": { "task1_violations": [ {"rule": "keywords_section_present", "note": "No Keywords section"}, {"rule": "author_block_present", "note": "No author affiliation block"}, ], "task2_inconsistencies": [ { "id": "IC_001", "type": "number_mismatch", "location_a": "abstract", "claim_a": "mIoU of 56.2 on the ADE20K dataset", "location_b": "results", "claim_b": "DenseVision achieves mIoU of 54.8", "injected": False, "note": "Abstract inflated by 1.4 mIoU", }, { "id": "IC_002", "type": "contribution_count", "location_a": "introduction", "claim_a": "four contributions listed", "location_b": "methods", "claim_b": "three methodological elements described", "injected": False, "note": "Intro promises 4 contributions, methods only implements 3", }, ], "task3_discrepancies": [ { "id": "D_001", "type": "table_text_mismatch", "text_location": "abstract", "text_claim": "mIoU of 56.2 on the ADE20K dataset", "table_id": "Table 1", "table_value": "54.8", "injected": True, }, { "id": "D_002", "type": "table_text_mismatch", "text_location": "results", "text_claim": "DenseVision achieves mIoU of 54.8 — competitive with SegFormer-B2 (51.8)", "table_id": "Table 1", "table_value": "SegFormer-B2 mIoU=51.8", "injected": False, "note": "This one IS consistent — control entry", }, ], }, } # ── Paper 003: Multi-Task Learning ──────────────────────────────────────────── PAPER_003 = { "id": "paper_003", "title": "UnifiedLM: Scaling Multi-Task Language Models with Adaptive Gradient Balancing", "source": "arxiv:synthetic_003", "license": "CC-BY 4.0", "difficulty_score": 0.80, "sections": { "abstract": ( "We present UnifiedLM, a large-scale multi-task language model trained on " "23 diverse NLP tasks simultaneously. UnifiedLM-3B achieves an average improvement " "of 4.7% over single-task baselines across all evaluated tasks. On SuperGLUE, " "UnifiedLM scores 91.2, surpassing human performance (89.8) by 1.4 points. " "Our adaptive gradient balancing (AGB) algorithm dynamically reweights task " "gradients to prevent dominated tasks from collapsing. We train models at " "three scales: 350M, 1B, and 3B parameters." ), "introduction": ( "Multi-task learning (MTL) in NLP seeks to share representations across tasks, " "improving data efficiency and generalisation. Classic challenges include negative " "transfer [1] and gradient conflict between tasks [2]. We introduce UnifiedLM " "and the AGB algorithm. Contributions: " "(1) AGB — an adaptive gradient balancing algorithm that provably reduces " "gradient conflict in MTL settings, " "(2) a unified training protocol across 23 NLP tasks without task-specific " "hyperparameter tuning, " "(3) state-of-the-art SuperGLUE results at the 3B parameter scale. " "UnifiedLM-3B achieves 91.2 on SuperGLUE." ), "methods": ( "UnifiedLM is based on a T5 [3] encoder-decoder backbone. We train on 23 tasks " "from the FLAN collection [4] and additional tasks from PromptSource [5]. " "The AGB algorithm computes per-task gradient norms at each step and reweights " "gradients to equalise their magnitudes. Specifically, for K tasks, task k " "receives weight w_k = median_norm / ||g_k||. This prevents any single task " "from dominating the shared parameter updates. Models are trained at three " "scales: 350M, 1B, and 3B parameters. Training uses Adafactor with a " "learning rate of 5e-4 for 500,000 steps." ), "results": ( "Table 1 shows SuperGLUE results. UnifiedLM-3B achieves 91.2. " "Table 2 compares average improvement over single-task baselines across " "all 23 tasks. UnifiedLM-3B improves by 3.9% on average — a significant " "and consistent gain across task families. " "Table 3 ablates the AGB algorithm. Removing AGB reduces SuperGLUE score " "by 2.3 points (from 91.2 to 88.9). Replacing AGB with PCGrad [2] gives " "90.1, confirming AGB's superiority. " "At the 1B scale, UnifiedLM scores 88.4 on SuperGLUE. " "Figure 1 shows gradient conflict reduction over training steps." ), "discussion": ( "AGB's provable gradient conflict reduction translates to consistent accuracy " "gains. The 3B model is our strongest, but the 1B model offers a better " "efficiency-accuracy trade-off. One limitation is the computational overhead " "of computing per-task gradient norms at each step, adding approximately 8% " "to training time. Future work will explore second-order AGB variants " "and extension to vision-language tasks." ), "references": ( "[1] Crawshaw (2020). Multi-Task Learning with Deep Neural Networks. arXiv.\n" "[2] Yu et al. (2020). Gradient Surgery for Multi-Task Learning. NeurIPS.\n" "[3] Raffel et al. (2020). Exploring the Limits of Transfer Learning with T5. JMLR.\n" "[4] Wei et al. (2022). Finetuned Language Models Are Zero-Shot Learners. ICLR.\n" "[5] Bach et al. (2022). PromptSource. ACL." ), }, "tables": { "Table 1": { "caption": "Table 1: SuperGLUE benchmark results.", "data": { "UnifiedLM-350M": "85.3", "UnifiedLM-1B": "88.4", "UnifiedLM-3B": "91.2", "Human baseline": "89.8", "T5-11B": "90.3", }, }, "Table 2": { "caption": "Table 2: Average improvement over single-task baselines (23 tasks).", "data": { "UnifiedLM-350M": "+2.1%", "UnifiedLM-1B": "+3.2%", "UnifiedLM-3B": "+3.9%", }, }, "Table 3": { "caption": "Table 3: Ablation of gradient balancing algorithm.", "data": { "UnifiedLM-3B (full)": "91.2", "w/o AGB": "88.9", "w/ PCGrad": "90.1", }, }, }, "figures": { "Figure 1": { "caption": "Figure 1: Gradient conflict (cosine similarity) over training.", "type": "line_chart", }, }, "ground_truth": { "task1_violations": [ {"rule": "abstract_max_words", "actual": 118, "limit": 100}, {"rule": "citation_format_ieee", "note": "Uses [N] inline but references not IEEE formatted"}, ], "task2_inconsistencies": [ { "id": "IC_001", "type": "number_mismatch", "location_a": "abstract", "claim_a": "average improvement of 4.7% over single-task baselines", "location_b": "results", "claim_b": "UnifiedLM-3B improves by 3.9% on average", "injected": False, "note": "Abstract says 4.7% but results and Table 2 say 3.9%", }, { "id": "IC_002", "type": "contribution_count", "location_a": "introduction", "claim_a": "three contributions listed in introduction", "location_b": "abstract", "claim_b": "abstract does not enumerate contributions", "injected": False, "note": "Intro lists 3 contributions; verify methods covers all 3", }, ], "task3_discrepancies": [ { "id": "D_001", "type": "table_text_mismatch", "text_location": "abstract", "text_claim": "average improvement of 4.7% over single-task baselines", "table_id": "Table 2", "table_value": "+3.9%", "injected": True, "note": "Abstract says 4.7%, Table 2 shows 3.9% for UnifiedLM-3B", }, { "id": "D_002", "type": "table_text_mismatch", "text_location": "results", "text_claim": "UnifiedLM-3B improves by 3.9% on average", "table_id": "Table 2", "table_value": "+3.9%", "injected": False, "note": "CONSISTENT — control; should NOT be reported", }, ], }, } # ── Write papers ────────────────────────────────────────────────────────────── def main(force: bool = False) -> None: import sys force = force or "--force" in sys.argv papers = [PAPER_001, PAPER_002, PAPER_003] # Skip if all JSON files already exist — avoids overwriting hand-annotated GT if not force and all((OUT_DIR / f"{p['id']}.json").exists() for p in papers): print(f" Corpus already present in {OUT_DIR.resolve()} — skipping.") print(" Pass --force to regenerate from scratch.") return for paper in papers: out_path = OUT_DIR / f"{paper['id']}.json" with out_path.open("w", encoding="utf-8") as f: json.dump(paper, f, indent=2, ensure_ascii=False) print(f" Written: {out_path}") print(f"\n✓ {len(papers)} papers written to {OUT_DIR.resolve()}") if __name__ == "__main__": main()