from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # (a usage sketch showing how these fields can be consumed is at the end of this module)
    # Task1
    Task1_Title_Search_Rate = Task("(Task1) Title Search Rate", "(Task1) Title Search Rate", "(T1) Title Search Rate (%)")
    Task1_Precision = Task("(Task1) Precision", "(Task1) Precision", "(T1) Precision (%)")
    Task1_Overlap = Task("(Task1) Overlap", "(Task1) Overlap", "(T1) Overlap (%)")
    Task1_Precision_First_Author = Task("(Task1) Precision (First Author)", "(Task1) Precision (First Author)", "(T1) Precision (First Author) (%)")
    Task1_Overlap_First_Author = Task("(Task1) Overlap (First Author)", "(Task1) Overlap (First Author)", "(T1) Overlap (First Author) (%)")
    # Task2
    Task2_Similarity = Task("(Task2) Similarity", "(Task2) Similarity", "(T2) Similarity (%)")
    Task2_Entail_TRUE = Task("(Task2) Entail (TRUE)", "(Task2) Entail (TRUE)", "(T2) Entail (TRUE %)")
    Task2_Entail_GPT_4o = Task("(Task2) Entail (GPT-4o)", "(Task2) Entail (GPT-4o)", "(T2) Entail (GPT-4o %)")
    Task2_ROUGE_1 = Task("(Task2) ROUGE-1", "(Task2) ROUGE-1", "(T2) ROUGE-1 (%)")
    Task2_ROUGE_2 = Task("(Task2) ROUGE-2", "(Task2) ROUGE-2", "(T2) ROUGE-2 (%)")
    Task2_ROUGE_L = Task("(Task2) ROUGE-L", "(Task2) ROUGE-L", "(T2) ROUGE-L (%)")
    # Task3
    Task3_Precision = Task("(Task3) Precision", "(Task3) Precision", "(T3) Precision (%)")
    Task3_Title_Search_Rate = Task("(Task3) Title Search Rate", "(Task3) Title Search Rate", "(T3) Title Search Rate (%)")
    Task3_Overlap = Task("(Task3) Overlap", "(Task3) Overlap", "(T3) Overlap (%)")
    Task3_KPR = Task("(Task3) KPR", "(Task3) KPR", "(T3) KPR (%)")
    Task3_ROUGE_1 = Task("(Task3) ROUGE-1", "(Task3) ROUGE-1", "(T3) ROUGE-1 (%)")
    Task3_ROUGE_2 = Task("(Task3) ROUGE-2", "(Task3) ROUGE-2", "(T3) ROUGE-2 (%)")
    Task3_ROUGE_L = Task("(Task3) ROUGE-L", "(Task3) ROUGE-L", "(T3) ROUGE-L (%)")


# Your leaderboard name
TITLE = """

LLM-Based Automated Literature Review Evaluation Benchmark

""" # What does your leaderboard evaluate? # Which evaluations are you running? how can people reproduce what you have? INTRODUCTION_TEXT = """ This leaderboard evaluates Large Language Models (LLMs) on their ability to perform automated literature review tasks, including reference generation, abstract writing, and review composition.
It is based on the study *Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition*.
The leaderboard measures how well different models generate valid references and produce factually consistent, stylistically appropriate academic text.

""" # # # # # # # #
#
# Reference Generation: Precision #
#
# Abstract Writing: True #
#
# Review Composition: Precision #
#
# Literature Review Writing: KPR #
# """ EVALUATION_QUEUE_TEXT = """""" # Which evaluations are you running? How can people reproduce what you have? LLM_BENCHMARKS_TEXT = """ ## Introduction The **LLM4LitReview Leaderboard** is dedicated to evaluating the capabilities of large language models (LLMs) in automating academic writing tasks, specifically literature review generation. We focus on three subtasks: 1. **Reference Generation** – accuracy and validity of citations. 2. **Abstract Writing** – semantic coverage and factual consistency. 3. **Review Composition** – accuracy and validity of citations and semantic coverage and factual consistency. This benchmark provides a structured and reproducible framework for assessing how close LLMs are to human-level academic writing quality. --- ## Evaluation Dataset Our evaluation dataset includes: - 1015 academic papers sampled from open-access journals across multiple disciplines. - For each paper, models generate: - A reference list according to the given title and key words - An abstract summarizing according to the given title and key words - A short literature review paragraph based on the provided title, keywords, and abstract --- ## Metrics Explained - **Precision** – Precision of references that correspond to real, verifiable academic papers. - **Title_search_rate** – Whether the generated paper can be searched by title in Semantic Scholar. - **Overlap_rate** – LLM-cited vs human-cited references. - **Similarity** – Semantic similarity between model and human-generated texts (human-written and LLM-generated abstract). - **Entailment (TRUE %)** – Factual consistency between model and human-generated texts (Use NLI model TRUE as the evaluator). - **Entailment (GPT-4o%)** – Factual consistency between model and human-generated texts (Use GPT-4o as the evaluator). - **ROUGE-1, ROUGE-2, ROUGE-L** – Standard metrics for evaluating text generation quality based on n-gram Overlap with human-written texts. - **KPR (Key Point Recall)** – Measures how well key points from source documents are captured in the generated text. --- """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r""" @misc{tang2025largelanguagemodelsautomated, title={Large Language Models for Automated Literature Review: An Evaluation of Reference Generation, Abstract Writing, and Review Composition}, author={Xuemei Tang and Xufeng Duan and Zhenguang G. Cai}, year={2025}, eprint={2412.13612}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2412.13612}, }"""