# Spaces: Viske/Spjimr · Running · File size: 10,356 Bytes · a52bae4

# ============================================================================
# examples.py — built-in labeled ML paper sentences
# ============================================================================
#
# PURPOSE
# -------
# A tiny dataset of labeled sentences drawn from well-known machine learning
# papers. Used in three places in the demo:
#
#   1. As TOOLS the agent can call (search, lookup, list) — see tools.py
#   2. As a DATA SOURCE students can load as context — see app.py
#   3. As the reference vocabulary for the CLASSIFY mode — see agent.py
#
# The same dataset feeds all three, so students can ask the same question
# three different ways and compare the approaches side-by-side in the
# Results tab.
#
# SCHEMA — each entry is a dict with exactly five keys:
#   sentence     (str)  the actual text
#   paper_id     (str)  stable slug "author-year-keyword"
#   paper_title  (str)  human-readable title
#   year         (int)  publication year
#   label        (str)  one of LABELS below
# ============================================================================


# Closed vocabulary for classification: the only values an entry's "label"
# field is meant to take. Keep this short — six labels is enough to be
# interesting and few enough that students can remember them.
# A tuple (not a list) so the vocabulary cannot be mutated by accident.
# NOTE(review): no entry in ML_EXAMPLES currently carries "related_work";
# confirm whether that gap is intentional.
LABELS = (
    "contribution",   # the paper's main claim ("we propose...")
    "method",         # how the approach works
    "result",         # a numerical or benchmark result
    "limitation",     # a weakness or failure mode the paper admits
    "motivation",     # why the problem matters
    "related_work",   # a reference to prior work
)


# The labeled dataset itself: a flat list of dicts, one per sentence, each
# with the keys "sentence", "paper_id", "paper_title", "year" and "label".
# Entries are grouped by paper; all entries of one paper repeat the same
# paper_id / paper_title / year. get_paper_info() and list_papers() read the
# title and year from the FIRST entry of each paper, so keep those fields
# identical within a group when adding sentences.
ML_EXAMPLES = [
    # Attention Is All You Need (Vaswani 2017)
    {
        "sentence": "We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.",
        "paper_id": "vaswani-2017-attention",
        "paper_title": "Attention Is All You Need",
        "year": 2017,
        "label": "contribution",
    },
    {
        "sentence": "The Transformer follows an encoder-decoder structure using stacked self-attention and point-wise fully connected layers for both the encoder and decoder.",
        "paper_id": "vaswani-2017-attention",
        "paper_title": "Attention Is All You Need",
        "year": 2017,
        "label": "method",
    },
    {
        "sentence": "Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results by over 2 BLEU.",
        "paper_id": "vaswani-2017-attention",
        "paper_title": "Attention Is All You Need",
        "year": 2017,
        "label": "result",
    },

    # BERT (Devlin 2018)
    {
        "sentence": "BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers.",
        "paper_id": "devlin-2018-bert",
        "paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "year": 2018,
        "label": "method",
    },
    {
        "sentence": "BERT advances the state of the art for eleven NLP tasks, pushing the GLUE score to 80.5 percent and SQuAD v1.1 F1 to 93.2.",
        "paper_id": "devlin-2018-bert",
        "paper_title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "year": 2018,
        "label": "result",
    },

    # GPT-3 (Brown 2020)
    {
        "sentence": "Scaling up language models greatly improves task-agnostic, few-shot performance, sometimes reaching competitiveness with prior fine-tuning approaches.",
        "paper_id": "brown-2020-gpt3",
        "paper_title": "Language Models are Few-Shot Learners",
        "year": 2020,
        "label": "contribution",
    },
    {
        "sentence": "We train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model.",
        "paper_id": "brown-2020-gpt3",
        "paper_title": "Language Models are Few-Shot Learners",
        "year": 2020,
        "label": "method",
    },
    {
        "sentence": "GPT-3 still has notable weaknesses in text synthesis and several NLP tasks, particularly those requiring reasoning over long passages.",
        "paper_id": "brown-2020-gpt3",
        "paper_title": "Language Models are Few-Shot Learners",
        "year": 2020,
        "label": "limitation",
    },

    # ResNet (He 2015)
    {
        "sentence": "Deeper neural networks are more difficult to train, and simply stacking more layers eventually degrades accuracy rather than improving it.",
        "paper_id": "he-2015-resnet",
        "paper_title": "Deep Residual Learning for Image Recognition",
        "year": 2015,
        "label": "motivation",
    },
    {
        "sentence": "We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously.",
        "paper_id": "he-2015-resnet",
        "paper_title": "Deep Residual Learning for Image Recognition",
        "year": 2015,
        "label": "contribution",
    },
    {
        "sentence": "An ensemble of these residual nets achieves 3.57 percent error on the ImageNet test set.",
        "paper_id": "he-2015-resnet",
        "paper_title": "Deep Residual Learning for Image Recognition",
        "year": 2015,
        "label": "result",
    },

    # AlphaGo (Silver 2016)
    {
        "sentence": "We introduce a new approach to computer Go using value networks to evaluate board positions and policy networks to select moves.",
        "paper_id": "silver-2016-alphago",
        "paper_title": "Mastering the game of Go with deep neural networks and tree search",
        "year": 2016,
        "label": "contribution",
    },
    {
        "sentence": "AlphaGo defeated the European champion Fan Hui by five games to zero, the first time a computer program has defeated a human professional on a full board.",
        "paper_id": "silver-2016-alphago",
        "paper_title": "Mastering the game of Go with deep neural networks and tree search",
        "year": 2016,
        "label": "result",
    },

    # CLIP (Radford 2021)
    {
        "sentence": "Learning directly from raw text about images is a promising alternative which leverages a much broader source of supervision.",
        "paper_id": "radford-2021-clip",
        "paper_title": "Learning Transferable Visual Models From Natural Language Supervision",
        "year": 2021,
        "label": "motivation",
    },
    {
        "sentence": "We demonstrate that predicting which caption goes with which image is an efficient and scalable way to learn image representations from scratch.",
        "paper_id": "radford-2021-clip",
        "paper_title": "Learning Transferable Visual Models From Natural Language Supervision",
        "year": 2021,
        "label": "method",
    },
    {
        "sentence": "CLIP matches the accuracy of the original ResNet-50 on ImageNet zero-shot without using any of the 1.28 million original labeled training examples.",
        "paper_id": "radford-2021-clip",
        "paper_title": "Learning Transferable Visual Models From Natural Language Supervision",
        "year": 2021,
        "label": "result",
    },

    # LoRA (Hu 2021)
    {
        "sentence": "Fine-tuning large pretrained models is often infeasible because it requires storing and deploying a separate set of parameters for every downstream task.",
        "paper_id": "hu-2021-lora",
        "paper_title": "LoRA: Low-Rank Adaptation of Large Language Models",
        "year": 2021,
        "label": "motivation",
    },
    {
        "sentence": "LoRA freezes pretrained model weights and injects trainable rank decomposition matrices into each Transformer layer, reducing trainable parameters by up to 10000x.",
        "paper_id": "hu-2021-lora",
        "paper_title": "LoRA: Low-Rank Adaptation of Large Language Models",
        "year": 2021,
        "label": "method",
    },

    # LLaMA (Touvron 2023)
    {
        "sentence": "We introduce LLaMA, a collection of foundation language models ranging from 7B to 65B parameters, trained on trillions of tokens using only publicly available datasets.",
        "paper_id": "touvron-2023-llama",
        "paper_title": "LLaMA: Open and Efficient Foundation Language Models",
        "year": 2023,
        "label": "contribution",
    },
    {
        "sentence": "LLaMA-13B outperforms GPT-3 on most benchmarks despite being more than 10x smaller.",
        "paper_id": "touvron-2023-llama",
        "paper_title": "LLaMA: Open and Efficient Foundation Language Models",
        "year": 2023,
        "label": "result",
    },
]


# ----------------------------------------------------------------
# Helper functions — used by tools.py and by run_classify in agent.py
# ----------------------------------------------------------------
def search_examples(query):
    """Find dataset entries whose sentence or paper title contains *query*.

    The match is a plain substring test, made case-insensitive by
    lower-casing both sides; surrounding whitespace in *query* is ignored.
    A missing, empty or whitespace-only query yields an empty list rather
    than matching everything.

    Returns a new list of the matching ML_EXAMPLES dicts (the dicts
    themselves are not copied), in dataset order.
    """
    needle = (query or "").lower().strip()
    hits = []
    if needle:
        for entry in ML_EXAMPLES:
            # Sentence is checked first; the title is only consulted when
            # the sentence did not match (any() stops at the first hit).
            if any(
                needle in entry[field].lower()
                for field in ("sentence", "paper_title")
            ):
                hits.append(entry)
    return hits


def get_paper_info(paper_id):
    """Summarize one paper of the dataset.

    Looks for entries whose "paper_id" equals *paper_id* exactly. Returns
    None when there is no such entry; otherwise a dict with the keys
    "paper_id", "title", "year" and "sentence_count", where title and year
    come from the first matching entry and sentence_count is the number of
    matching entries.
    """
    first_hit = None
    hit_count = 0
    for entry in ML_EXAMPLES:
        if entry["paper_id"] != paper_id:
            continue
        if first_hit is None:
            # Title and year are read from the earliest entry of the paper.
            first_hit = entry
        hit_count += 1

    if first_hit is None:
        return None

    info = {"paper_id": paper_id}
    info["title"] = first_hit["paper_title"]
    info["year"] = first_hit["year"]
    info["sentence_count"] = hit_count
    return info


def list_papers():
    """List every distinct paper in the dataset, oldest first.

    Each paper is described by a dict with the keys "paper_id", "title",
    "year" and "sentence_count" (how many entries share that paper_id).
    Title and year are taken from the paper's first entry. The result is
    ordered by year; papers from the same year keep the order in which
    they first appear in ML_EXAMPLES.
    """
    summaries = {}
    for entry in ML_EXAMPLES:
        key = entry["paper_id"]
        record = summaries.get(key)
        if record is None:
            # First sentence seen for this paper: create its summary.
            record = summaries[key] = {
                "paper_id": key,
                "title": entry["paper_title"],
                "year": entry["year"],
                "sentence_count": 0,
            }
        record["sentence_count"] += 1

    ordered = list(summaries.values())
    # list.sort is stable, so same-year papers stay in first-seen order.
    ordered.sort(key=lambda paper: paper["year"])
    return ordered