Kimis Perros committed on
Commit
461f64f
·
0 Parent(s):

Initial deployment

Browse files
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # --- Python Cache ---
2
+ __pycache__/
3
+ .pytest_cache/
4
+ *.py[cod]
5
+
6
+ # --- OS-specific files ---
7
+ .DS_Store
README.md ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SQuAD 2.0 QA System
3
+ colorFrom: blue
4
+ colorTo: green
5
+ sdk: gradio
6
+ sdk_version: 4.0.0
7
+ app_file: app.py
8
+ ---
9
+
10
+ # SQuAD 2.0 Question Answering System
11
+
12
+ ## Model Details
13
+ - **General-Purpose Pre-Trained Model**: bert-base-uncased
14
+ - **Training Dataset**: SQuAD 2.0 (~130K examples)
15
+ - **Performance**: >70% F1 score on dev set
16
+ - **Capabilities**: Handles both answerable and unanswerable questions
17
+
18
+ ## Usage
19
+ Provide a context paragraph and ask a question to extract the answer.
app.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Question Answering System trained on SQuAD 2.0
"""

import sys
from pathlib import Path

import gradio as gr

# Make the repository root importable so the 'src' package resolves.
_APP_DIR = Path(__file__).parent
sys.path.insert(0, str(_APP_DIR))

from src.models.bert_based_model import BertBasedQAModel
from src.config.model_configs import OriginalBertQAConfig
from src.etl.types import QAExample

# Load the fine-tuned checkpoint once at startup; inference runs on CPU.
model = BertBasedQAModel.load_from_experiment(
    experiment_dir=Path("checkpoint"), config_class=OriginalBertQAConfig, device="cpu"
)


def answer_question(context: str, question: str) -> str:
    """Run the QA model on (context, question) and return the answer text."""
    context = context.strip()
    question = question.strip()
    if not context:
        return "Please provide context text."
    if not question:
        return "Please provide a question."

    try:
        example = QAExample(
            question_id="demo",
            title="Demo",
            question=question,
            context=context,
            answer_texts=[],
            answer_starts=[],
            is_impossible=False,
        )
        prediction = model.predict({"demo": example})["demo"]
        # Empty string means the model predicted "no answer".
        return prediction.predicted_answer or "No answer found."
    except Exception as e:  # surface failures in the UI instead of crashing
        return f"Error: {str(e)}"


demo = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Textbox(lines=8, placeholder="Enter context paragraph...", label="Context"),
        gr.Textbox(placeholder="Enter your question...", label="Question"),
    ],
    outputs=gr.Textbox(label="Answer", show_copy_button=True),
    title="SQuAD 2.0 Question Answering",
    description="BERT-base model fine-tuned on SQuAD 2.0 dataset",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()
checkpoint/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backbone_name": "bert-base-uncased",
3
+ "max_sequence_length": 384,
4
+ "learning_rate": 5e-05,
5
+ "num_epochs": 2,
6
+ "batch_size": 48,
7
+ "eval_batch_size": 1024,
8
+ "no_answer_threshold": 0.0,
9
+ "device": "cuda"
10
+ }
checkpoint/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64535ce08c77f38ce4243a75daa6ac4696de0999319fb1fb6d8c6550ed18ba2a
3
+ size 438019655
checkpoint/tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
checkpoint/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
checkpoint/tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch==2.8.0
2
+ transformers==4.57.0
3
+ gradio==4.0.0
4
+ pandas==2.3.3
5
+ numpy==2.2.6
src/__init__.py ADDED
File without changes
src/config/__init__.py ADDED
File without changes
src/config/model_configs.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Immutable configurations enabling to share common fields across the specific models used.
"""

from abc import ABC
from dataclasses import dataclass
from typing import ClassVar


@dataclass(frozen=True)
class BaseModelConfig(ABC):
    """
    Container storing configurations useful across all QA models.
    """


@dataclass(frozen=True)
class AlwaysNoAnswerModelConfig(BaseModelConfig):
    """
    Trivial baseline that always predicts no-answer ("").
    """

    MODEL_TYPE: ClassVar[str] = "always_no_answer"


@dataclass(frozen=True)
class SentenceEmbeddingModelConfig(BaseModelConfig):
    """
    Config object for the simpler baseline model.
    """

    # ClassVar keeps MODEL_TYPE out of the generated dataclass machinery
    # (__init__/__eq__/...): it is shared by the class, not a per-instance field.
    MODEL_TYPE: ClassVar[str] = "embedding_best_sentence"
    # TODO - consider switching to other defaults for non-Apple users
    device: str = "mps"
    sentence_model_name: str = "all-MiniLM-L6-v2"
    no_answer_threshold: float = 0.5


@dataclass(frozen=True)
class BertQAConfig(BaseModelConfig, ABC):
    """
    Shared super-class config to be sub-classed by BERT model variants.
    """

    # Declared here (without defaults) so type checkers see them on the base;
    # concrete sub-classes supply the actual default values.
    backbone_name: str
    max_sequence_length: int
    learning_rate: float
    num_epochs: int
    batch_size: int
    eval_batch_size: int
    no_answer_threshold: float
    device: str = "cuda"


@dataclass(frozen=True)
class TinyBertQAConfig(BertQAConfig):
    """
    Config for a Tiny BERT-based extractive QA system.
    """

    MODEL_TYPE: ClassVar[str] = "tinybert_qa"
    # General-purpose checkpoint (not QA-tuned)
    backbone_name: str = "huawei-noah/TinyBERT_General_4L_312D"
    max_sequence_length: int = 256
    learning_rate: float = 2e-5
    num_epochs: int = 5
    batch_size: int = 64
    eval_batch_size: int = 2048
    no_answer_threshold: float = 0.0


@dataclass(frozen=True)
class OriginalBertQAConfig(BertQAConfig):
    """
    Config for a BERT-based extractive QA system (original BERT model).
    """

    MODEL_TYPE: ClassVar[str] = "original_bert_qa"
    # General-purpose checkpoint (not QA-tuned)
    backbone_name: str = "bert-base-uncased"
    max_sequence_length: int = 384
    learning_rate: float = 5e-5
    num_epochs: int = 2
    batch_size: int = 48
    eval_batch_size: int = 1024
    no_answer_threshold: float = 0.5
src/etl/__init__.py ADDED
File without changes
src/etl/squad_v2_loader.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains core ETL functionality to load train/dev datasets.
3
+ """
4
+
5
+ from typing import Dict
6
+ from pathlib import Path
7
+ import json
8
+ import pandas as pd
9
+ from src.etl.types import QAExample
10
+ from src.utils.constants import Col, RawField
11
+
12
+ DEFAULT_ENCODING = "utf-8"
13
+
14
+
15
def load_squad_v2_df(file_path: Path) -> pd.DataFrame:
    """
    Read a SQuAD v2.0 JSON file into a pandas DataFrame.

    Produced columns:
        - Col.QUESTION_ID.value : str (unique)
        - Col.TITLE.value : str
        - Col.CONTEXT.value : str
        - Col.QUESTION.value : str
        - Col.IS_IMPOSSIBLE.value : bool
        - Col.ANSWER_TEXTS.value : List[str] (all gold answers; [] if impossible)
        - Col.ANSWER_STARTS.value : List[int] (all start offsets; [] if impossible)
        - Col.NUM_ANSWERS.value : int (len(answers))
    """
    assert file_path.exists(), f"File not found: {file_path}"
    with file_path.open("r", encoding=DEFAULT_ENCODING) as handle:
        payload = json.load(handle)

    # Fail fast on anything that is not a v2.0 dump with the expected top keys.
    assert (
        set(payload.keys()) == {RawField.VERSION.value, RawField.DATA.value}
        and payload[RawField.VERSION.value] == "v2.0"
    ), "Unexpected input data formatting."

    records = []
    for article in payload[RawField.DATA.value]:
        article_title = article[Col.TITLE.value]
        for paragraph in article[RawField.PARAGRAPHS.value]:
            paragraph_context = paragraph[Col.CONTEXT.value]
            for qa in paragraph[RawField.QAS.value]:
                # Gold answers may legitimately be empty (unanswerable question).
                raw_answers = qa[RawField.ANSWERS.value]
                assert isinstance(raw_answers, list), "Unexpected raw answers type."
                texts = [ans[RawField.ANSWER_TEXT.value] for ans in raw_answers]
                starts = [ans[RawField.ANSWER_START.value] for ans in raw_answers]

                # Structural check: every gold text needs a matching offset.
                assert len(texts) == len(
                    starts
                ), f"Mismatched gold lengths for {qa[Col.QUESTION_ID.value]}"

                records.append(
                    {
                        Col.QUESTION_ID.value: qa[Col.QUESTION_ID.value],
                        Col.TITLE.value: article_title,
                        Col.CONTEXT.value: paragraph_context,
                        Col.QUESTION.value: qa[Col.QUESTION.value],
                        Col.IS_IMPOSSIBLE.value: bool(qa[Col.IS_IMPOSSIBLE.value]),
                        Col.ANSWER_TEXTS.value: texts,
                        Col.ANSWER_STARTS.value: starts,
                        Col.NUM_ANSWERS.value: len(texts),
                    }
                )

    df = pd.DataFrame(records)
    assert (
        df[Col.QUESTION_ID.value].duplicated().sum() == 0
    ), "Unexpected non-unique question ID."
    return df
74
+
75
+
76
def df_to_examples_map(df: pd.DataFrame) -> Dict[str, QAExample]:
    """
    Convert DF -> Dict[question ID, QAExample].
    The loader has already asserted ID uniqueness and basic row structure.
    """
    expected = {
        Col.QUESTION_ID.value,
        Col.TITLE.value,
        Col.CONTEXT.value,
        Col.QUESTION.value,
        Col.IS_IMPOSSIBLE.value,
        Col.ANSWER_TEXTS.value,
        Col.ANSWER_STARTS.value,
    }
    missing = expected - set(df.columns)
    assert not missing, f"Missing required columns: {sorted(missing)}"

    examples: Dict[str, QAExample] = {}
    for _, record in df.iterrows():
        qid = record[Col.QUESTION_ID.value]
        assert qid not in examples, f"Duplicate id during build: {qid}"
        examples[qid] = QAExample(
            question_id=qid,
            title=record[Col.TITLE.value],
            question=record[Col.QUESTION.value],
            context=record[Col.CONTEXT.value],
            # list(...) defends against accidentally shared list references
            answer_texts=list(record[Col.ANSWER_TEXTS.value] or []),
            answer_starts=list(record[Col.ANSWER_STARTS.value] or []),
            is_impossible=record[Col.IS_IMPOSSIBLE.value],
        )
    return examples
src/etl/types.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Creates frozen dataclass objects per individual ground-truth example and individual prediction.
3
+
4
+ Benefits:
5
+ - Instance immutability: avoids accidental changes to data which would be otherwise unexpected
6
+ - Explicit type annotation across object fields, removes ambiguity
7
+ - Compact implementation: reduces boilerplate code (e.g., __init__() is auto-generated)
8
+ - Post-init preserves consistent validation for each and every object created
9
+ """
10
+
11
+ from __future__ import annotations
12
+ from dataclasses import dataclass
13
+ from typing import List, Dict
14
+
15
+
16
@dataclass(frozen=True)
class QAExample:
    """
    One gold (ground-truth) QA instance taken from SQuAD, kept as a frozen
    dataclass so it cannot be mutated while the code runs.
    All gold answers are stored, matching the official evaluation script.
    When is_impossible is True, answer_texts and answer_starts must be empty;
    __post_init__() enforces that invariant on every construction.
    """

    question_id: str
    title: str
    question: str
    context: str
    answer_texts: List[str]  # [] when is_impossible is True
    answer_starts: List[int]  # [] when is_impossible is True
    is_impossible: bool

    def __post_init__(self):
        # Reject truthy non-bool values (e.g. 1/"yes") for is_impossible.
        if not isinstance(self.is_impossible, bool):
            raise ValueError("is_impossible field needs to be of boolean type.")

        if len(self.answer_texts) != len(self.answer_starts):
            raise ValueError(
                "Incompatible sizes of answer_texts/answer_starts of QAExample."
            )
        # Answer lists and the impossibility flag must agree.
        if self.is_impossible and (self.answer_texts or self.answer_starts):
            raise ValueError(
                "Incompatible configuration between is_impossible (True) Vs answer_texts/answer_starts (non-empty) of QAExample."
            )
        if not self.is_impossible and (
            not self.answer_texts or not self.answer_starts
        ):
            raise ValueError(
                "Incompatible configuration between is_impossible (False) Vs answer_texts/answer_starts (empty) of QAExample."
            )
+
53
+
54
+ @dataclass(frozen=True)
55
+ class Prediction:
56
+ """
57
+ Single model prediction for a question.
58
+ __post_init__() method validates for consistency with expected values.
59
+ """
60
+
61
+ question_id: str
62
+ predicted_answer: str # '' if the model predicts no-answer
63
+ confidence: float # corresponds to the confidence level that the question is answerable via the context
64
+ is_impossible: bool
65
+
66
+ def __post_init__(self):
67
+ if not (0 <= self.confidence <= 1):
68
+ raise ValueError(
69
+ "Confidence of Prediction object should be a probability score [0, 1]."
70
+ )
71
+
72
+ @classmethod
73
+ def null(cls, question_id: str, confidence: float = 0.0) -> Prediction:
74
+ """
75
+ No-answer Prediction constructor to standardize it throughout the code.
76
+ """
77
+ return cls(
78
+ question_id=question_id,
79
+ predicted_answer="",
80
+ confidence=confidence,
81
+ is_impossible=True,
82
+ )
83
+
84
+ @classmethod
85
+ def flatten_predicted_answers(
86
+ cls, predictions: Dict[str, Prediction]
87
+ ) -> Dict[str, str]:
88
+ """
89
+ Convert Dict[qid, Prediction] -> Dict[qid, str] -
90
+ similar to official evaluation script style.
91
+ """
92
+ # TODO - add an extra check that each key of the Dict matches with the
93
+ # question ID stored as part of the Prediction object
94
+ return {qid: p.predicted_answer for qid, p in predictions.items()}
src/evaluation/__init__.py ADDED
File without changes
src/evaluation/evaluator.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Specifies the Evaluator's functionality.
3
+ Leverages metrics as computed in the official SQuAD v2.0 evaluation
4
+ script to ensure reporting consistency.
5
+ """
6
+
7
+ from typing import Dict, List
8
+ from src.evaluation.metrics import Metrics
9
+ from src.etl.types import QAExample, Prediction
10
+ from src.evaluation.squad_v2_official import (
11
+ normalize_answer,
12
+ compute_exact,
13
+ compute_f1,
14
+ )
15
+
16
+
17
+ class Evaluator:
18
+ def evaluate(
19
+ self, predictions: Dict[str, Prediction], examples: Dict[str, QAExample]
20
+ ) -> Metrics:
21
+
22
+ assert len(examples) > 0, "Examples must be non-empty."
23
+ assert isinstance(predictions, dict) and isinstance(
24
+ examples, dict
25
+ ), "Inputs must be dicts."
26
+ extras = set(predictions.keys()).symmetric_difference(set(examples.keys()))
27
+ assert (
28
+ not extras
29
+ ), f"Differences across predictions/examples question ids: {list(sorted(extras))[:3]} ..."
30
+
31
+ golds: Dict[str, List[str]] = {}
32
+ for qid, ex in examples.items():
33
+ if ex.is_impossible:
34
+ golds[qid] = [""]
35
+ else:
36
+ # similar to the official script - filter out golds which normalize to empty
37
+ filtered = [t for t in ex.answer_texts if normalize_answer(str(t))]
38
+ golds[qid] = filtered if filtered else [""]
39
+
40
+ em_sum = 0.0
41
+ f1_sum = 0.0
42
+
43
+ for qid, gold_list in golds.items():
44
+ pred_obj = predictions.get(qid)
45
+ if not pred_obj:
46
+ raise ValueError(
47
+ "Unexpected absence of Prediction object for question ID:%s" % qid
48
+ )
49
+ pred_text = pred_obj.predicted_answer
50
+ assert isinstance(pred_text, str), "Unexpected predicted answer type."
51
+
52
+ best_em = max((compute_exact(g, pred_text) for g in gold_list), default=0)
53
+ best_f1 = max((compute_f1(g, pred_text) for g in gold_list), default=0.0)
54
+
55
+ em_sum += float(best_em)
56
+ f1_sum += float(best_f1)
57
+
58
+ total = len(golds)
59
+ assert total >= 1, "Unexpected empty dict of ground-truth items."
60
+ return Metrics(
61
+ exact_score=100.0 * (em_sum / total),
62
+ f1_score=100.0 * (f1_sum / total),
63
+ total_num_instances=total,
64
+ )
src/evaluation/inspect_scores.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains supplementary routines for post-hoc validation/inspection of the results:
3
+ - Additional safeguard that dev set results are reliable (external recomputation of F1/EM metrics).
4
+ - Offers example-level inspection to users.
5
+ """
6
+
7
+ import json
8
+ import pandas as pd
9
+ from pathlib import Path
10
+ from src.utils.constants import Col
11
+ from src.evaluation.squad_v2_official import normalize_answer, compute_exact, compute_f1
12
+
13
+
14
def validate_experiment(exp_dir: Path, df: pd.DataFrame) -> pd.DataFrame:
    """Load predictions, recompute EM/F1, and cross-check against saved metrics."""
    exp_dir = Path(exp_dir)

    # Join saved predictions onto the gold dataframe, keyed by question ID.
    predictions = json.loads((exp_dir / "predictions.json").read_text())
    df_eval = df.set_index(Col.QUESTION_ID.value).join(
        pd.Series(predictions, name="predicted_answer")
    )
    assert df_eval["predicted_answer"].isna().sum() == 0, "Missing predictions"

    df_eval = _compute_scores(df_eval)
    computed_em = 100.0 * df_eval["em_score"].mean()
    computed_f1 = 100.0 * df_eval["f1_score"].mean()

    # Compare against the metrics persisted at training time.
    saved = json.loads((exp_dir / "metrics.json").read_text())
    saved_em, saved_f1 = saved["exact_score"], saved["f1_score"]

    print(f"\n{exp_dir.name}")
    print(f"Computed: EM={computed_em:.2f}%, F1={computed_f1:.2f}%")
    print(f"Saved: EM={saved_em:.2f}%, F1={saved_f1:.2f}%")
    if abs(computed_em - saved_em) < 0.01 and abs(computed_f1 - saved_f1) < 0.01:
        print("MATCH\n")
    else:
        print("MISMATCH - check evaluation\n")
    return df_eval
40
+
41
+
42
def _compute_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of df with per-row em_score and f1_score columns added."""
    em_scores = []
    f1_scores = []
    for _, record in df.iterrows():
        pred = record["predicted_answer"]
        golds = record[Col.ANSWER_TEXTS.value]

        # Mirror the official script: unanswerable -> [""], and drop golds
        # that normalize to the empty string.
        if not golds:
            golds = [""]
        else:
            golds = [g for g in golds if normalize_answer(str(g))] or [""]

        em_scores.append(max((compute_exact(g, pred) for g in golds), default=0))
        f1_scores.append(max((compute_f1(g, pred) for g in golds), default=0.0))

    out = df.copy()
    out["em_score"] = em_scores
    out["f1_score"] = f1_scores
    return out
src/evaluation/metrics.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Lightweight Metrics container.

Benefits:
- Facilitates addition/removal of fields without breaking callers.
- Better isolation of responsibilities around code exporting metrics for experiment tracking.
"""

from dataclasses import dataclass, asdict
from typing import Any, Dict


@dataclass(frozen=True)
class Metrics:
    # Minimal required fields, aligned with the official script's main metrics.
    exact_score: float
    f1_score: float
    total_num_instances: int

    def export_for_exp_tracking(self) -> Dict[str, Any]:
        """
        Export a dict for experiment artifacts, dropping None-valued keys.
        """
        return {
            key: value for key, value in asdict(self).items() if value is not None
        }
src/evaluation/squad_v2_official.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Official evaluation script for SQuAD version 2.0.
2
+
3
+ In addition to basic functionality, we also compute additional statistics and
4
+ plot precision-recall curves if an additional na_prob.json file is provided.
5
+ This file is expected to map question ID's to the model's predicted probability
6
+ that a question is unanswerable.
7
+
8
+ TODO: Preserve only functions used in prod (i.e., metrics).
9
+ The full file is temporaririly maintained to ensure parity between
10
+ the official evaluation script Vs in-house prod metrics.
11
+ """
12
+
13
+ import argparse
14
+ import collections
15
+ import json
16
+ import numpy as np
17
+ import os
18
+ import re
19
+ import string
20
+ import sys
21
+
22
+ OPTS = None
23
+
24
+
25
def parse_args():
    """Build and parse the CLI arguments; print help and exit when none are given."""
    parser = argparse.ArgumentParser(
        "Official evaluation script for SQuAD version 2.0."
    )
    # Required positionals.
    parser.add_argument("data_file", metavar="data.json", help="Input data JSON file.")
    parser.add_argument("pred_file", metavar="pred.json", help="Model predictions.")
    # Optional outputs and no-answer handling.
    parser.add_argument(
        "--out-file",
        "-o",
        metavar="eval.json",
        help="Write accuracy metrics to file (default is stdout).",
    )
    parser.add_argument(
        "--na-prob-file",
        "-n",
        metavar="na_prob.json",
        help="Model estimates of probability of no answer.",
    )
    parser.add_argument(
        "--na-prob-thresh",
        "-t",
        type=float,
        default=1.0,
        help='Predict "" if no-answer probability exceeds this (default = 1.0).',
    )
    parser.add_argument(
        "--out-image-dir",
        "-p",
        metavar="out_images",
        default=None,
        help="Save precision-recall curves to directory.",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    return parser.parse_args()
62
+
63
+
64
def make_qid_to_has_ans(dataset):
    """Map each question id to whether it has at least one gold answer."""
    return {
        qa["id"]: bool(qa["answers"])
        for article in dataset
        for p in article["paragraphs"]
        for qa in p["qas"]
    }
71
+
72
+
73
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    # Same pipeline as the original: lower -> strip punctuation ->
    # drop a/an/the -> collapse whitespace.
    punctuation = set(string.punctuation)
    lowered = s.lower()
    no_punct = "".join(ch for ch in lowered if ch not in punctuation)
    no_articles = re.sub(re.compile(r"\b(a|an|the)\b", re.UNICODE), " ", no_punct)
    return " ".join(no_articles.split())
91
+
92
+
93
def get_tokens(s):
    """Whitespace-tokenize the normalized string; empty/None input -> no tokens."""
    return normalize_answer(s).split() if s else []
97
+
98
+
99
def compute_exact(a_gold, a_pred):
    """1 if the normalized gold and prediction match exactly, else 0."""
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
101
+
102
+
103
def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer."""
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    if not gold_toks or not pred_toks:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)
117
+
118
+
119
def get_raw_scores(dataset, preds):
    """Per-question exact and F1 scores, maximized over all gold answers."""
    exact_scores = {}
    f1_scores = {}
    for article in dataset:
        for p in article["paragraphs"]:
            for qa in p["qas"]:
                qid = qa["id"]
                if qid not in preds:
                    print("Missing prediction for %s" % qid)
                    continue
                a_pred = preds[qid]
                # Keep only golds that normalize to something non-empty.
                gold_answers = [
                    a["text"] for a in qa["answers"] if normalize_answer(a["text"])
                ]
                if not gold_answers:
                    # For unanswerable questions, only correct answer is empty string
                    gold_answers = [""]
                # Take max over all gold answers
                exact_scores[qid] = max(
                    compute_exact(a, a_pred) for a in gold_answers
                )
                f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
    return exact_scores, f1_scores
140
+
141
+
142
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    """Replace scores for questions whose no-answer probability exceeds the threshold.

    Above the threshold the question is treated as predicted-unanswerable:
    credit 1.0 when it truly has no answer, 0.0 otherwise.
    """
    adjusted = {}
    for qid, score in scores.items():
        if na_probs[qid] > na_prob_thresh:
            adjusted[qid] = float(not qid_to_has_ans[qid])
        else:
            adjusted[qid] = score
    return adjusted
151
+
152
+
153
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    """Aggregate mean EM/F1 (x100) over all qids, or over a provided subset."""
    if qid_list:
        total = len(qid_list)
        exact_sum = sum(exact_scores[k] for k in qid_list)
        f1_sum = sum(f1_scores[k] for k in qid_list)
    else:
        total = len(exact_scores)
        exact_sum = sum(exact_scores.values())
        f1_sum = sum(f1_scores.values())
    return collections.OrderedDict(
        [
            ("exact", 100.0 * exact_sum / total),
            ("f1", 100.0 * f1_sum / total),
            ("total", total),
        ]
    )
172
+
173
+
174
def merge_eval(main_eval, new_eval, prefix):
    """Copy new_eval entries into main_eval (in place) under prefixed keys."""
    for key, value in new_eval.items():
        main_eval["%s_%s" % (prefix, key)] = value
177
+
178
+
179
def plot_pr_curve(precisions, recalls, out_image, title):
    """Render a step-style precision-recall curve and save it to out_image.

    NOTE(review): relies on a module-level `plt` (presumably matplotlib.pyplot)
    that is not imported in the visible portion of this file - confirm it is
    bound before this function is called.
    """
    plt.step(recalls, precisions, color="b", alpha=0.2, where="post")
    plt.fill_between(recalls, precisions, step="post", alpha=0.2, color="b")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.title(title)
    plt.savefig(out_image)
    plt.clf()
189
+
190
+
191
def make_precision_recall_eval(
    scores, na_probs, num_true_pos, qid_to_has_ans, out_image=None, title=None
):
    """Sweep no-answer-probability thresholds and return average precision (x100).

    Questions are visited in increasing na_prob order; precision/recall points
    are emitted only where a threshold could actually be placed (i.e. between
    distinct na_prob values). Optionally plots the curve when out_image is set.
    """
    ordered_qids = sorted(na_probs, key=lambda k: na_probs[k])
    true_pos = 0.0
    cur_p, cur_r = 1.0, 0.0
    precisions, recalls = [1.0], [0.0]
    avg_prec = 0.0
    for i, qid in enumerate(ordered_qids):
        if qid_to_has_ans[qid]:
            true_pos += scores[qid]
        cur_p = true_pos / float(i + 1)
        cur_r = true_pos / float(num_true_pos)
        at_end = i == len(ordered_qids) - 1
        if at_end or na_probs[qid] != na_probs[ordered_qids[i + 1]]:
            # i.e., if we can put a threshold after this point
            avg_prec += cur_p * (cur_r - recalls[-1])
            precisions.append(cur_p)
            recalls.append(cur_r)
    if out_image:
        plot_pr_curve(precisions, recalls, out_image, title)
    return {"ap": 100.0 * avg_prec}
214
+
215
+
216
def run_precision_recall_analysis(
    main_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans, out_image_dir
):
    """Compute PR curves (exact, F1, oracle) and merge their APs into main_eval."""
    if out_image_dir and not os.path.exists(out_image_dir):
        os.makedirs(out_image_dir)
    num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
    if num_true_pos == 0:
        # No answerable questions: recall is undefined, nothing to analyze.
        return

    def _pr_eval(raw_scores, image_name, curve_title):
        # Shared invocation; only the scores, image path and title vary.
        return make_precision_recall_eval(
            raw_scores,
            na_probs,
            num_true_pos,
            qid_to_has_ans,
            out_image=os.path.join(out_image_dir, image_name),
            title=curve_title,
        )

    pr_exact = _pr_eval(
        exact_raw, "pr_exact.png", "Precision-Recall curve for Exact Match score"
    )
    pr_f1 = _pr_eval(f1_raw, "pr_f1.png", "Precision-Recall curve for F1 score")
    # Oracle: score 1 exactly for the answerable questions.
    oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
    pr_oracle = _pr_eval(
        oracle_scores,
        "pr_oracle.png",
        "Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)",
    )
    merge_eval(main_eval, pr_exact, "pr_exact")
    merge_eval(main_eval, pr_f1, "pr_f1")
    merge_eval(main_eval, pr_oracle, "pr_oracle")
252
+
253
+
254
def histogram_na_prob(na_probs, qid_list, image_dir, name):
    """
    Save a 20-bin histogram of no-answer probabilities for the given qids.

    Args:
        na_probs: qid -> no-answer probability.
        qid_list: subset of question IDs to plot; no-op when empty.
        image_dir: directory receiving the output PNG.
        name: label used in the plot title and output filename.
    """
    if not qid_list:
        return
    probs = [na_probs[qid] for qid in qid_list]
    # Uniform weights normalize the bars to proportions instead of counts.
    bin_weights = np.ones_like(probs) / float(len(probs))
    plt.hist(probs, weights=bin_weights, bins=20, range=(0.0, 1.0))
    plt.xlabel("Model probability of no-answer")
    plt.ylabel("Proportion of dataset")
    plt.title("Histogram of no-answer probability: %s" % name)
    plt.savefig(os.path.join(image_dir, "na_prob_hist_%s.png" % name))
    plt.clf()
265
+
266
+
267
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    """
    Sweep the no-answer probability threshold and return the best achievable
    aggregate score (as a percentage of len(scores)) and the threshold that
    attains it.

    Args:
        preds: qid -> predicted answer string ("" means no-answer).
        scores: qid -> per-question score when the question is answered.
        na_probs: qid -> model probability of no-answer.
        qid_to_has_ans: qid -> whether the question is answerable.
    """
    # Baseline: answering nothing gets every unanswerable question right.
    num_no_ans = sum(1 for has_ans in qid_to_has_ans.values() if not has_ans)
    running_score = num_no_ans
    best_score = running_score
    best_thresh = 0.0
    # Walk questions from most to least confident (lowest na_prob first);
    # each step simulates lowering the threshold to include one more answer.
    for qid in sorted(na_probs, key=na_probs.get):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            delta = scores[qid]
        elif preds[qid]:
            delta = -1  # answered an unanswerable question: lose its point
        else:
            delta = 0  # predicted no-answer anyway: nothing changes
        running_score += delta
        if running_score > best_score:
            best_score = running_score
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh
288
+
289
+
290
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    """
    Tune the no-answer threshold separately for EM and F1 and record the
    best scores and thresholds in main_eval (keys: best_exact,
    best_exact_thresh, best_f1, best_f1_thresh).
    """
    for metric_name, raw_scores in (("exact", exact_raw), ("f1", f1_raw)):
        best_score, best_thresh = find_best_thresh(
            preds, raw_scores, na_probs, qid_to_has_ans
        )
        main_eval["best_%s" % metric_name] = best_score
        main_eval["best_%s_thresh" % metric_name] = best_thresh
299
+
300
+
301
def main():
    """
    Entry point: load dataset/predictions (and optional no-answer probs),
    score them, optionally tune thresholds and emit PR/ histogram plots,
    then write or print the evaluation dictionary.
    """
    with open(OPTS.data_file) as f:
        dataset = json.load(f)["data"]
    with open(OPTS.pred_file) as f:
        preds = json.load(f)
    if OPTS.na_prob_file:
        with open(OPTS.na_prob_file) as f:
            na_probs = json.load(f)
    else:
        # Without model-provided probabilities, treat every prediction as
        # fully confident that the question is answerable.
        na_probs = {qid: 0.0 for qid in preds}
    qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
    has_ans_qids = [qid for qid, has_ans in qid_to_has_ans.items() if has_ans]
    no_ans_qids = [qid for qid, has_ans in qid_to_has_ans.items() if not has_ans]
    exact_raw, f1_raw = get_raw_scores(dataset, preds)
    exact_thresh = apply_no_ans_threshold(
        exact_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh
    )
    f1_thresh = apply_no_ans_threshold(
        f1_raw, na_probs, qid_to_has_ans, OPTS.na_prob_thresh
    )
    out_eval = make_eval_dict(exact_thresh, f1_thresh)
    # Break out per-subset metrics for answerable / unanswerable questions.
    for subset_name, subset_qids in (("HasAns", has_ans_qids), ("NoAns", no_ans_qids)):
        if subset_qids:
            subset_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=subset_qids)
            merge_eval(out_eval, subset_eval, subset_name)
    if OPTS.na_prob_file:
        find_all_best_thresh(
            out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans
        )
        if OPTS.out_image_dir:
            run_precision_recall_analysis(
                out_eval, exact_raw, f1_raw, na_probs, qid_to_has_ans, OPTS.out_image_dir
            )
            histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, "hasAns")
            histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, "noAns")
    if OPTS.out_file:
        with open(OPTS.out_file, "w") as f:
            json.dump(out_eval, f)
    else:
        print(json.dumps(out_eval, indent=2))
344
+
345
+
346
if __name__ == "__main__":
    OPTS = parse_args()
    if OPTS.out_image_dir:
        # Lazy import: matplotlib is only needed when plots are requested.
        # The Agg backend must be selected BEFORE importing pyplot so figures
        # can be written to disk on headless machines (no display required).
        import matplotlib

        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
    main()
src/models/__init__.py ADDED
File without changes
src/models/always_no_answer_model.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Always-no-answer baseline: returns a standardized null Prediction for every question.
3
+ """
4
+
5
+ from typing import Dict, Optional
6
+ from src.models.base_qa_model import QAModel
7
+ from src.etl.types import QAExample, Prediction
8
+ from src.config.model_configs import AlwaysNoAnswerModelConfig
9
+
10
+
11
class AlwaysNoAnswerQAModel(QAModel):
    """
    Trivial baseline: every question receives the standardized null
    Prediction, i.e. it is always predicted as unanswerable ("").
    """

    def __init__(self, config: AlwaysNoAnswerModelConfig) -> None:
        super().__init__()
        assert isinstance(
            config, AlwaysNoAnswerModelConfig
        ), "Incompatible configuration object."
        self.config = config

    def train(
        self,
        train_examples: Optional[Dict[str, QAExample]] = None,
        val_examples: Optional[Dict[str, QAExample]] = None,
    ) -> None:
        """No-op: this baseline has no parameters. Kept for QAModel API parity."""
        return

    def predict(self, examples: Dict[str, QAExample]) -> Dict[str, Prediction]:
        """Map every question ID to a null (no-answer) Prediction."""
        assert isinstance(examples, dict), "Incompatible input examples type."
        predictions: Dict[str, Prediction] = {}
        for qid in examples:
            predictions[qid] = Prediction.null(question_id=qid)
        return predictions
src/models/base_qa_model.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Dict, Optional
3
+ from src.etl.types import QAExample, Prediction
4
+
5
+
6
class QAModel(ABC):
    """Basic contract dictating specific QA model implementation requirements."""

    @abstractmethod
    def train(
        self,
        train_examples: Dict[str, QAExample],
        val_examples: Optional[Dict[str, QAExample]] = None,
    ) -> None:
        """
        Trains the model; assumes uniqueness of keys of train_examples (unique question IDs).

        Args:
            train_examples: mapping of question ID -> QAExample used for fitting.
            val_examples: optional held-out examples a subclass may use for
                per-epoch evaluation (e.g., early stopping); may be ignored.
        """
        raise NotImplementedError

    @abstractmethod
    def predict(self, examples: Dict[str, QAExample]) -> Dict[str, Prediction]:
        """
        Produces one Prediction per question ID.

        Args:
            examples: mapping of question ID -> QAExample to answer.

        Returns:
            Mapping of the same question IDs to their Prediction objects.
        """
        raise NotImplementedError
src/models/bert_based_model.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains functionality adapting a general-purpose BERT-type model
3
+ for the QA task. The BertBasedQAModel fully aligns with the structure of
4
+ other models (i.e., sub-classing QAModel for consistency); and stores a custom
5
+ QAModule which specifies the wiring of the general-purpose model's representations
6
+ with the linear NN layer needed for the QA task.
7
+
8
+ Benefits:
9
+ - Facilitates a **plug-and-play** selection of the underlying encoder model.
10
+ - Follows a clean, composition pattern, avoiding double inheritance of both
11
+ QAModel and torch.nn.Module which may introduce unnecessary complexity
12
+ (e.g., which __init__() is called, which train() is called, etc.)
13
+ """
14
+
15
+ import torch
16
+ import random
17
+ import json
18
+ import numpy as np
19
+ from dataclasses import asdict
20
+ from pathlib import Path
21
+ from typing import Dict, Optional, List, Tuple
22
+ from transformers import AutoTokenizer, AutoModel
23
+ from transformers.tokenization_utils_base import BatchEncoding
24
+ from torch.utils.data import Dataset, DataLoader
25
+
26
+ from src.models.base_qa_model import QAModel
27
+ from src.config.model_configs import BertQAConfig
28
+ from src.etl.types import QAExample, Prediction
29
+ from src.evaluation.evaluator import Evaluator, Metrics
30
+ from src.utils.constants import DEBUG_SEED
31
+
32
+
33
def set_seed(seed: int = DEBUG_SEED) -> None:
    """
    Seed Python's `random`, NumPy, and PyTorch (CPU, CUDA, and MPS) RNGs
    for reproducible runs.

    NOTE - this is mainly to facilitate experimentation progress; options such
    as torch.backends.cudnn.benchmark = False may hurt performance and thus running
    this function may need to be skipped in production.

    Relevant resources:
    - https://stackoverflow.com/questions/67581281/does-torch-manual-seed-include-the-operation-of-torch-cuda-manual-seed-all
    - https://docs.pytorch.org/docs/stable/notes/randomness.html

    # TODO - move to utilities file
    """
    # CPU-side RNGs shared by all backends
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA (NVIDIA GPUs): seed every visible device and force deterministic
    # cuDNN kernel selection.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    # MPS (Apple Silicon) keeps a separate RNG state.
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)
59
+
60
+
61
class QADataset(Dataset):
    """
    Thin adapter exposing a Dict[str, QAExample] as a torch Dataset so that
    DataLoader can batch and shuffle examples without any manual index
    bookkeeping on our side.

    # TODO - move to utilities file
    """

    def __init__(self, examples_dict: Dict[str, QAExample]):
        """Materialize the dict values once; DataLoader indexes into this list."""
        self.examples = list(examples_dict.values())

    def __len__(self) -> int:
        """Number of stored examples; DataLoader uses this to plan batches."""
        return len(self.examples)

    def __getitem__(self, idx: int) -> QAExample:
        """Fetch the example stored at position idx."""
        return self.examples[idx]
81
+
82
+
83
class BertBasedQAModel(QAModel):
    """
    QAModel implementation backed by a HuggingFace BERT-family encoder.

    Owns the tokenizer and a QAModule (encoder + linear span head) and
    implements: training with a span-extraction objective, chunked batched
    prediction with a no-answer threshold, and loading a trained model from
    an experiment directory.
    """

    def __init__(self, config: BertQAConfig) -> None:
        super().__init__()
        # Reproducible weight initialization
        set_seed()
        assert isinstance(config, BertQAConfig), "Incompatible configuration object."
        self.config = config

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.backbone_name, use_fast=True
        )
        self.qa_module = QAModule(config=self.config)

        # Sanity check to ensure that [CLS] token is always at position 0;
        # This assumption is used in the code for predicting non-answerable questions
        test_encoding = self.tokenizer("testQ", "testC", return_tensors="pt")
        assert (
            # [0, 0] --> [first (and only) example of batch, first sequence token for example]
            test_encoding["input_ids"][0, 0].item()
            == self.tokenizer.cls_token_id
        ), "Model doesn't follow BERT's [CLS]-at-position-0 convention."

    @classmethod
    def load_from_experiment(
        cls, experiment_dir: Path, config_class, device: str = "mps"
    ):
        """
        Loads model from the experiment tracking directory.

        experiment_dir: Path to the experiment (e.g., 'experiments/<date_time>_bert-base_ALL_articles')
        device: by default we load into Apple MPS for local experimentation with predictions (e.g., threshold tuning)
        """
        experiment_dir = Path(experiment_dir)
        model_dir = experiment_dir / "model"
        if not model_dir.exists():
            raise FileNotFoundError(f"Model directory not found: {model_dir}")

        print(f"\nLoading model from experiment: {experiment_dir.name}")
        with open(experiment_dir / "config.json", "r") as f:
            config_dict = json.load(f)

        # Override device
        config_dict["device"] = device
        config = config_class(**config_dict)

        model = cls(config)

        tokenizer_path = model_dir / "tokenizer"
        if not tokenizer_path.exists():
            raise FileNotFoundError(f"Tokenizer not found: {tokenizer_path}")
        model.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

        weights_path = model_dir / "pytorch_model.bin"
        if not weights_path.exists():
            raise FileNotFoundError(f"Model weights not found: {weights_path}")
        state_dict = torch.load(weights_path, map_location=device)
        model.qa_module.load_state_dict(state_dict)

        # Loaded models default to inference mode; train() flips this back.
        model.qa_module.eval()
        print("Model loaded succesfully and set to eval mode.")
        return model

    def train(
        self,
        train_examples: Optional[Dict[str, QAExample]] = None,
        val_examples: Optional[Dict[str, QAExample]] = None,
    ) -> None:
        """
        Trains the QA model on provided training examples.

        train_examples: qid -> QAExample; must be non-empty.
        val_examples: optional qid -> QAExample used only for per-epoch evaluation.
        """
        # Reproducible training loop
        set_seed()

        # Ensuring dropout is properly configured if it is applied
        self.qa_module.train()

        assert train_examples is not None, "Training examples cannot be None."
        assert len(train_examples) > 0, "Training examples cannot be empty."

        self._print_training_setup(train_examples, val_examples, self.config)

        # Adam is standard for BERT-type models; AdamW handles weight decay better
        optimizer = torch.optim.AdamW(
            self.qa_module.parameters(),  # Trains both encoder and linear head
            lr=self.config.learning_rate,
        )
        # ignore_index=-1: Skip examples where answer wasn't found in tokenization;
        # see _extract_gold_positions() for details
        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
        dataset = QADataset(train_examples)
        # Should shuffle to avoid bias towards certain combination of examples within a batch
        dataloader = DataLoader(
            dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            collate_fn=lambda batch: batch,  # Return list as-is, don't collate
        )
        print(f"Total batches per epoch: {len(dataloader)}")
        print(f"{'='*70}\n")

        for epoch in range(self.config.num_epochs):
            print(f"{'='*70}")
            print(f"EPOCH {epoch + 1}/{self.config.num_epochs}")
            print(f"{'='*70}")
            total_loss = 0.0

            # Logging/debugging: accumulate examples ignored in the loss due to answer truncation
            set_truncated_examples = set()
            for batch_idx, batch_examples in enumerate(dataloader):
                # convert to the format expected by the _prepare_batch() function
                batch_dict = {ex.question_id: ex for ex in batch_examples}
                qids, _, _, encoded = self._prepare_batch(batch_dict)
                assert (
                    len(qids) == encoded["input_ids"].shape[0] == len(batch_examples)
                ), "Training shape mismatch after batch prepare."

                gold_starts, gold_ends = self._extract_gold_positions(
                    batch_examples, encoded, set_truncated_examples
                )

                # Labels must live on the same device as the module's parameters
                device = next(self.qa_module.parameters()).device
                gold_starts = gold_starts.to(device)
                gold_ends = gold_ends.to(device)

                start_logits, end_logits = self.qa_module(
                    input_ids=encoded["input_ids"],
                    attention_mask=encoded.get("attention_mask"),
                    token_type_ids=encoded.get("token_type_ids"),
                )
                # Shape should match (batch_size, sequence_length)
                expected_shape = (len(batch_examples), encoded["input_ids"].shape[1])
                assert (
                    start_logits.shape == expected_shape
                ), f"start_logits shape {start_logits.shape} != expected {expected_shape}"
                assert (
                    end_logits.shape == expected_shape
                ), f"end_logits shape {end_logits.shape} != expected {expected_shape}"

                start_loss = loss_fn(start_logits, gold_starts)
                end_loss = loss_fn(end_logits, gold_ends)

                # Similar to how the original BERT paper defines the objective for SQuAD (Section 4.2)
                loss = (start_loss + end_loss) / 2.0
                assert loss.dim() == 0, f"Loss should be scalar, got shape {loss.shape}"

                # --- Standard backprop flow ---
                # Zero out/initialize gradients from previous batch
                optimizer.zero_grad()
                # Backpropagate gradients
                loss.backward()
                # Update model parameters using computed grads
                optimizer.step()
                total_loss += loss.item()

                if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == len(dataloader):
                    avg_loss = total_loss / (batch_idx + 1)
                    print(
                        f" Batch {batch_idx + 1}/{len(dataloader)} | Avg Loss: {avg_loss:.4f}"
                    )

            avg_epoch_loss = total_loss / len(dataloader)
            # Currently ignored returned metrics; TODO - use them later for early stopping
            _, _ = self._print_epoch_summary(
                epoch=epoch + 1,
                total_epochs=self.config.num_epochs,
                avg_loss=avg_epoch_loss,
                num_truncated=len(set_truncated_examples),
                train_examples=train_examples,
                val_examples=val_examples,
            )

        print("Training Completed.")
        self.qa_module.eval()

    def _print_epoch_summary(
        self,
        epoch: int,
        total_epochs: int,
        avg_loss: float,
        num_truncated: int,
        train_examples: Dict[str, QAExample],
        val_examples: Optional[Dict[str, QAExample]] = None,
    ) -> Tuple[Metrics, Optional[Metrics]]:
        """
        Logs end-of-epoch statistics and evaluates on the train set (and on
        validation, when provided). Returns the (train, val) metrics.
        """
        if num_truncated > 0:
            print(
                f"{num_truncated} examples truncated throughout the epoch."
                f" Start & end answer tokens could not be identified."
            )
        print(f"\nEpoch {epoch}/{total_epochs} Complete | Average Loss: {avg_loss:.4f}")
        train_metrics = self._evaluate_and_print(train_examples, "Training")
        val_metrics = None
        if val_examples is not None:
            val_metrics = self._evaluate_and_print(val_examples, "Validation")

        # Always resume training mode after evaluation
        self.qa_module.train()
        print(f"{'='*70}\n")
        return train_metrics, val_metrics

    def _evaluate_and_print(
        self, examples: Dict[str, QAExample], split_name: str
    ) -> Metrics:
        """Runs prediction + evaluation on one split and prints its EM/F1."""
        print(f"Evaluating on {split_name} set...")
        predictions = self.predict(examples)
        metrics = Evaluator().evaluate(predictions, examples)
        print(
            f"{split_name} | EM: {metrics.exact_score:.2f}%, F1: {metrics.f1_score:.2f}%"
        )
        return metrics

    def _extract_gold_positions(
        self,
        examples: List[QAExample],
        encoded: BatchEncoding,
        set_truncated_examples: set[str],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Maps character-level answer positions to token-level positions.
        In particular, for each example, the function computes (all offsets are start-inclusive, end-exclusive):
        - the answer offset within the context: [char_start, char_end)
        - each individual token's offset within the context: [token_char_start, token_char_end)

        For two ranges [A, B) and [C, D) to overlap:
        1. The first range should start before the second ends (A < D)
        2. The second range should start before the first ends (C < B)
        These are the conditions the function utilizes to determine an answer's overlap with a specific token.

        Finally, the function picks the FIRST and LAST tokens overlapping with the answer:
        those tokens can fully determine the answer and align with the QA training objective.

        Returns:
        - gold_starts: Tensor (size: batch size) with token index for answer start
        - gold_ends: Tensor (size: batch size) with token index for answer end
        """
        offsets = encoded["offset_mapping"].tolist()
        batch_size = len(examples)
        assert (
            len(offsets) == batch_size
        ), f"Offset mapping size {len(offsets)} != batch size {batch_size}"

        # Accumulate gold positions for each example in the batch
        gold_starts = []
        gold_ends = []
        for i, example in enumerate(examples):

            # Following BERT paper (Section 4.3) - point to [CLS] token (0, 0) for unanswerables
            if example.is_impossible:
                gold_starts.append(0)
                gold_ends.append(0)
                continue
            assert (
                len(example.answer_starts) > 0
            ), f"Answerable question {example.question_id} without valid answers."

            # Simply pick the first available answer (even if multiple are provided)
            answer_text = example.answer_texts[0]
            char_start = example.answer_starts[0]
            char_end = char_start + len(answer_text)

            token_start = None  # Will store first token overlapping with the answer
            token_end = None  # Will store last token overlapping with the answer
            for token_idx, (token_char_start, token_char_end) in enumerate(offsets[i]):

                # skip special tokens ([CLS], [SEP], ...)
                if token_char_start == 0 and token_char_end == 0:
                    continue

                # Need first overlapping token -> check if None
                if token_start is None and token_char_end > char_start:
                    token_start = token_idx

                # Need last overlapping token -> check exhaustively
                if token_char_start < char_end:
                    token_end = token_idx

            if token_start is None or token_end is None:
                # print(
                #     f"Warning! Answer truncated for {example.question_id}, skipping in loss"
                # )
                set_truncated_examples.add(example.question_id)
                # Answer was truncated -> use -1 such that it is ignored for loss computation
                gold_starts.append(-1)
                gold_ends.append(-1)
                continue
            assert (
                token_start <= token_end
            ), f"Invalid token span: start {token_start} > end {token_end}"

            gold_starts.append(token_start)
            gold_ends.append(token_end)

        gold_starts_tensor = torch.tensor(gold_starts, dtype=torch.long)
        gold_ends_tensor = torch.tensor(gold_ends, dtype=torch.long)
        assert (
            len(examples) == len(gold_starts_tensor) == len(gold_ends_tensor)
        ), "Ground-truth token shape mismatch."
        return gold_starts_tensor, gold_ends_tensor

    def predict(
        self, examples: Dict[str, QAExample], threshold_override: Optional[float] = None
    ) -> Dict[str, Prediction]:
        """
        Wrapper that automatically chunks large prediction requests to avoid OOM.

        threshold_override: when given, replaces config.no_answer_threshold
        for this call only (useful for threshold tuning).
        """
        self.qa_module.eval()
        assert isinstance(examples, dict), "Incompatible input examples type."
        assert len(examples) > 0, "No examples to run prediction on."

        eval_batch_size = self.config.eval_batch_size
        if len(examples) <= eval_batch_size:
            return self._predict_batch(examples, threshold_override)

        all_qids = list(examples.keys())
        all_predictions = {}
        # Chunking larger batches to avoid OOM errors
        for i in range(0, len(all_qids), eval_batch_size):
            batch_qids = all_qids[i : i + eval_batch_size]
            batch_examples = {qid: examples[qid] for qid in batch_qids}
            all_predictions.update(
                self._predict_batch(batch_examples, threshold_override)
            )

        return all_predictions

    def _predict_batch(
        self, examples: Dict[str, QAExample], threshold_override: Optional[float] = None
    ) -> Dict[str, Prediction]:
        """
        Processes a single batch of examples:
        encapsulates the forward pass + logic to determine the final model's response
        based on the predicted logits for each token being the start/end of the true answer.
        """
        # Offers overriding the default threshold if this is provided
        threshold = (
            threshold_override
            if threshold_override is not None
            else self.config.no_answer_threshold
        )

        # 1) Batch tokenization
        qids, _, contexts, encoded = self._prepare_batch(examples)

        # 2) Forward pass
        # Inference mode - no gradient calculation
        with torch.no_grad():
            start_logits, end_logits = self.qa_module(
                input_ids=encoded["input_ids"],
                attention_mask=encoded.get("attention_mask"),
                token_type_ids=encoded.get("token_type_ids"),
            )

        # 3) Create context mask: (batch_size, max_sequence_length) boolean tensor;
        # Valid positions: context tokens + [CLS] (for unanswerables);
        # Masked: question tokens, [SEP], padding
        if encoded.get("token_type_ids") is not None:
            # token_type_ids == 1 means context segment (Vs question segment); filter out padding tokens
            context_mask = (encoded["token_type_ids"] == 1) & (
                encoded["attention_mask"] == 1
            )
        else:
            # Fallback for models without token_type_ids (shouldn't happen with BERT)
            context_mask = encoded["attention_mask"] == 1
        # Explicitly allow [CLS] token at position 0 -> predicted token for unanswerables
        context_mask[:, 0] = True
        context_mask = context_mask.to(self.config.device)

        # Apply an extreme negative value to the position associated with filtered-out tokens;
        # avoid neg-inf -> pathological cases where softmax over all neg-inf logits would result in all nans
        MIN_NUMBER = torch.finfo(start_logits.dtype).min
        start_logits = start_logits.masked_fill(~context_mask, MIN_NUMBER)
        end_logits = end_logits.masked_fill(~context_mask, MIN_NUMBER)

        # 4) Simplistic/greedy selection of tokens for start/end of the predicted response;
        # Note that [CLS] is also available to be picked as the most probable token
        best_start_indices = start_logits.argmax(dim=1)
        best_end_indices = end_logits.argmax(dim=1)

        # 5) Extract predictions from token positions
        # offsets reveals where each token maps in the original text;
        # example: token "apple" at token position 3 may map to text[10:15]
        offsets = encoded["offset_mapping"].tolist()
        predictions = {}
        for i, qid in enumerate(qids):
            # edge case - no valid context tokens --> return unanswerable (excluding [CLS] at position 0)
            if not context_mask[i, 1:].any():
                predictions[qid] = Prediction.null(question_id=qid)
                continue

            start_idx = best_start_indices[i].item()
            end_idx = best_end_indices[i].item()

            # Compute null score vs best span score (as per the BERT paper, Section 4.3)
            null_score = start_logits[i, 0].item() + end_logits[i, 0].item()
            best_span_score = (
                start_logits[i, start_idx].item() + end_logits[i, end_idx].item()
            )
            # Predict no-answer if null score exceeds best span by threshold
            if best_span_score <= null_score + threshold:
                predictions[qid] = Prediction.null(question_id=qid)
                continue

            # NOTE: When end_idx < start_idx, the BERT paper specifies searching
            # all valid spans to find the maximum scoring one. For efficiency and simplicity
            # of an initial implementation, we return null. When end_idx >= start_idx, no
            # exhaustive search is necessary (simply picking the best start/end index suffices).
            if end_idx < start_idx:
                predictions[qid] = Prediction.null(question_id=qid)
                continue

            # Map token positions -> character positions in the original text
            start_char, _ = offsets[i][start_idx]  # Character start of first token
            _, end_char = offsets[i][end_idx]  # Character end of last token

            # Special tokens (such as [CLS], [SEP]) have offset [0, 0];
            # mark as unanswerable if we selected a special token
            if start_char == 0 and end_char == 0:
                predictions[qid] = Prediction.null(question_id=qid)
                continue

            assert end_char >= start_char, (
                f"BUG: Invalid character span [{start_char}, {end_char}] "
                f"for valid token span [{start_idx}, {end_idx}] in question {qid}. "
                f"This indicates a problem with offset mapping or token masking."
            )

            # Extract answer text from original context
            answer_text = contexts[i][start_char:end_char].strip()
            # reject whitespace-only responses
            if not answer_text:
                predictions[qid] = Prediction.null(question_id=qid)
                continue

            # Create final prediction
            predictions[qid] = Prediction(
                question_id=qid,
                predicted_answer=answer_text,
                confidence=1.0,  # TODO - use a better way to estimate uncertainty
                is_impossible=False,
            )
        return predictions

    def _prepare_batch(
        self, examples: Dict[str, QAExample]
    ) -> Tuple[List[str], List[str], List[str], BatchEncoding]:
        """
        Extracts questions and contexts in consistent order, then tokenizes them.

        Returns (qids, questions, contexts, encoded) where all lists share the
        same ordering and encoded is the batched tokenizer output.
        """
        qids = list(examples.keys())
        questions = [examples[qid].question for qid in qids]
        contexts = [examples[qid].context for qid in qids]
        encoded = self._encode_pairs(questions, contexts)
        return qids, questions, contexts, encoded

    def _encode_pairs(self, questions: list[str], contexts: list[str]) -> BatchEncoding:
        """
        Standardizes tokenization across all stages (train/inference).
        For more information, refer to the HF documentation, for example see:
        https://huggingface.co/docs/transformers/pad_truncation regarding sequence padding/trunctation.
        """
        assert len(questions) == len(
            contexts
        ), "Question and context lists are incompatible."
        return self.tokenizer(
            text=questions,
            text_pair=contexts,
            truncation="only_second",  # prioritizing truncating context Vs question
            max_length=self.config.max_sequence_length,
            padding="max_length",  # pads to uniform length for conversion to fixed-size tensors
            return_offsets_mapping=True,  # returns (char_start, char_end) for each token
            return_tensors="pt",
        )

    @staticmethod
    def _print_training_setup(
        train_examples: Dict[str, QAExample],
        val_examples: Optional[Dict[str, QAExample]],
        config: BertQAConfig,
    ) -> None:
        """Print training setup information including data splits and configuration."""
        answerable_count = sum(
            1 for ex in train_examples.values() if not ex.is_impossible
        )
        unanswerable_count = len(train_examples) - answerable_count

        print(f"\n{'='*70}")
        print(f"TRAINING SETUP")
        print(f"{'='*70}")
        print(f"Total examples: {len(train_examples)}")
        print(f" Answerable: {answerable_count}")
        print(f" Unanswerable: {unanswerable_count}")
        assert len(train_examples) > 0, "No training examples!"

        if val_examples is not None:
            val_answerable = sum(
                1 for ex in val_examples.values() if not ex.is_impossible
            )
            val_unanswerable = len(val_examples) - val_answerable
            print(
                f"Validation: {len(val_examples)} total ({val_answerable} answerable, {val_unanswerable} unanswerable)"
            )

        print(f"\nConfiguration:")
        print(json.dumps(asdict(config), indent=2))
        print(f"{'='*70}\n")
588
+
589
+
590
class QAModule(torch.nn.Module):
    """
    Torch module pairing a pretrained, general-purpose encoder with a
    2-output linear head that scores every token as a potential answer
    start / answer end position.
    """

    def __init__(self, config: BertQAConfig) -> None:
        super().__init__()
        assert isinstance(config, BertQAConfig), "Incompatible configuration object."
        self.encoder = AutoModel.from_pretrained(config.backbone_name)
        # Read hidden_size off the loaded encoder so any compatible backbone
        # (e.g., DistilBERT, BERT) can be swapped in without code changes.
        hidden_size = self.encoder.config.hidden_size
        self.linear_head = torch.nn.Linear(in_features=hidden_size, out_features=2)

        # Move encoder and head to the configured device in one call
        self.to(config.device)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        input_ids: tokenized integer IDs from the vocabulary
        attention_mask: binary mask reflecting actual token Vs padding token
        token_type_ids: binary mask reflecting the segment: sentence A Vs sentence B

        Returns (start_logits, end_logits), each of shape (B, L).
        """
        # All inputs must live on the same device as the module's parameters
        target_device = next(self.parameters()).device
        input_ids = input_ids.to(target_device)
        attention_mask = (
            attention_mask.to(target_device) if attention_mask is not None else None
        )
        token_type_ids = (
            token_type_ids.to(target_device) if token_type_ids is not None else None
        )

        encoder_out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Last-layer token representations: (B, L, H) -> linear head -> (B, L, 2)
        logits = self.linear_head(encoder_out.last_hidden_state)
        # Split the final dimension into per-token start and end scores
        return logits[..., 0], logits[..., 1]
src/models/sentence_embedding_model.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains a simple baseline for the QA system.
3
+ """
4
+
5
+ import spacy
6
+ from typing import Dict, Optional, List
7
+ from src.config.model_configs import SentenceEmbeddingModelConfig
8
+ from src.models.base_qa_model import QAModel
9
+ from sentence_transformers import SentenceTransformer, util
10
+ from src.etl.types import Prediction, QAExample
11
+
12
+
13
class SentenceEmbeddingQAModel(QAModel):
    """
    Minimal retrieval-style baseline: answers with the single context sentence
    whose sentence-transformers embedding (https://sbert.net/) has the highest
    cosine similarity to the question embedding. Produces a null prediction
    when the context yields no sentences or the best similarity falls below
    the configured no-answer threshold.
    """

    def __init__(self, config: SentenceEmbeddingModelConfig) -> None:
        super().__init__()
        assert isinstance(
            config, SentenceEmbeddingModelConfig
        ), "Incompatible configuration object."
        self.config = config
        # Embedding backbone; placed on whichever device the config selects.
        self._st_model = SentenceTransformer(
            model_name_or_path=config.sentence_model_name,
            device=config.device,
        )
        # Lightweight English pipeline used purely for sentence segmentation.
        self._nlp = spacy.load("en_core_web_sm")

    def train(
        self,
        train_examples: Optional[Dict[str, QAExample]] = None,
        val_examples: Optional[Dict[str, QAExample]] = None,
    ) -> None:
        """No-op: this baseline has no trainable parameters. Kept for API parity with the super-class."""
        return

    def predict(self, examples: Dict[str, QAExample]) -> Dict[str, Prediction]:
        assert isinstance(examples, dict), "Incompatible input examples type."

        results: Dict[str, Prediction] = {}
        for question_id, ex in examples.items():
            context_sentences = self._split_sentences(ex.context)
            if not context_sentences:
                results[question_id] = Prediction.null(question_id=question_id)
                continue

            question_vec = self._st_model.encode(
                ex.question, convert_to_tensor=True, normalize_embeddings=True
            )
            sentence_vecs = self._st_model.encode(
                context_sentences, convert_to_tensor=True, normalize_embeddings=True
            )
            # (1, num_sentences) similarity row -> flat vector of scores
            similarities = util.cos_sim(question_vec, sentence_vecs).squeeze(0)
            winner = int(similarities.argmax().item())
            winner_score = float(similarities[winner])

            if winner_score < self.config.no_answer_threshold:
                # Best match is too weak -> treat the question as unanswerable.
                results[question_id] = Prediction.null(question_id=question_id)
            else:
                results[question_id] = Prediction(
                    question_id=question_id,
                    predicted_answer=context_sentences[winner],
                    confidence=winner_score,
                    is_impossible=False,
                )
        return results

    def _split_sentences(self, text: str) -> List[str]:
        """Segment text into non-empty, stripped sentences via spaCy."""
        cleaned = (text or "").strip()
        if not cleaned:
            return []
        doc = self._nlp(cleaned)
        return [sent.text.strip() for sent in doc.sents if sent.text.strip()]
src/pipeline/__init__.py ADDED
File without changes
src/pipeline/qa_runner.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Contains a simple experimentation pipeline for the QA system.
3
+
4
+ Benefits:
5
+ - Completely **plug-and-play**: the users can easily replace models and configs without
6
+ needing to change pipeline or other code.
7
+ - Automates experiment tracking/versioning: facilitates experimental iteration.
8
+ - Offers data splitting routines promoting model generalization & objective perf measuring:
9
+ a) initial training set gets split into: 'train' Vs 'val' subsets which DO NOT share
10
+ common articles, such that the 'val' set simulates actual held-out article performance;
11
+ b) initial 'dev' set can remain untouched until the very end for objective perf measuring
12
+ """
13
+
14
+ import pandas as pd
15
+ from typing import Tuple, Dict, Optional
16
+ from pathlib import Path
17
+ import sys
18
+ from io import StringIO
19
+ from src.utils.constants import (
20
+ EXPERIMENTS_DIR,
21
+ DEV_DATA_PATH,
22
+ TRAIN_DATA_PATH,
23
+ Col,
24
+ DEBUG_SEED,
25
+ )
26
+ from src.etl.squad_v2_loader import load_squad_v2_df, df_to_examples_map
27
+ from src.models.base_qa_model import QAModel
28
+ from src.etl.types import QAExample
29
+ from src.config.model_configs import BaseModelConfig, BertQAConfig
30
+ from src.evaluation.evaluator import Evaluator
31
+ from src.utils.experiment_snapshot import ExperimentSnapshot
32
+
33
# Fraction of training ARTICLES held out as the validation split (see split_by_title).
DEFAULT_VAL_SET_FRACTION = 0.1
34
+
35
+
36
class Tee:
    """
    Fan-out stream wrapper (cf. the Unix `tee` command): every write is mirrored
    to each wrapped file-like object, so console output can simultaneously be
    captured for experiment tracking.
    Based on: https://stackoverflow.com/questions/616645/how-to-duplicate-sys-stdout-to-a-log-file
    """

    def __init__(self, *files):
        self.files = files

    def write(self, obj):
        # Mirror the payload to every destination, flushing eagerly so the
        # text shows up immediately on all of them.
        for stream in self.files:
            stream.write(obj)
            stream.flush()

    def flush(self):
        # Propagate explicit flush requests to every destination as well.
        for stream in self.files:
            stream.flush()
55
+
56
+
57
def run_qa_experiment(
    experiment_name: str,
    model: QAModel,
    debug_limit: Optional[int] = None,
    val_fraction: float = DEFAULT_VAL_SET_FRACTION,
) -> Tuple[ExperimentSnapshot, Path, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
    """
    Basic pipeline for running a QA system experiment:
    load data -> train model -> predict on dev -> evaluate -> persist snapshot.

    Args:
        experiment_name: label used for the tracked run directory.
        model: any QAModel implementation; its `config` attribute is snapshotted.
        debug_limit: if given, caps the number of training ARTICLES loaded.
        val_fraction: fraction of training articles carved out as validation.

    Returns:
        (snapshot, run_dir, df_train, df_dev) — the saved ExperimentSnapshot,
        its on-disk directory, and the ETLed train/dev DataFrames (returned to
        facilitate debugging).

    To facilitate debugging:
    1. The function can limit the #training examples processed
    2. The sampled ETLed input DF is also provided as part of the function return

    Note that debug_limit is only applied to the training instances; i.e., dev set is not capped.
    """
    # TODO - use proper logging for all of this
    # Capture output to StringIO while printing to console; the captured text
    # is later written into the run directory as training_log.txt.
    log_capture = StringIO()
    original_stdout = sys.stdout
    sys.stdout = Tee(sys.stdout, log_capture)

    try:
        if debug_limit is not None:
            print(f"{debug_limit} articles will be considered from training in total.")
        else:
            print("All articles from training set are considered.")
        assert TRAIN_DATA_PATH.exists(), "Unspecified train data location."
        # Note that df_val can be returned for debugging: ignored for now
        (train_examples, val_examples), (df_train, _) = _load_examples(
            path=TRAIN_DATA_PATH, debug_limit=debug_limit, split_fraction=val_fraction
        )

        assert DEV_DATA_PATH.exists(), "Unspecified dev data location."
        # do NOT split dev set -> split_fraction is explicitly set to None
        (dev_examples, _), (df_dev, _) = _load_examples(
            path=DEV_DATA_PATH, debug_limit=None, split_fraction=None
        )

        # Sanity checking for non-empty data splits
        assert len(train_examples) > 0, "train_examples is empty."
        assert len(dev_examples) > 0, "dev_examples is empty."

        # Pass val_examples only when a split was produced, so models that
        # ignore validation data still work unchanged.
        if val_examples is not None:
            model.train(train_examples, val_examples=val_examples)
        else:
            model.train(train_examples)
        predictions = model.predict(dev_examples)
        metrics = Evaluator().evaluate(predictions=predictions, examples=dev_examples)

        # Save experiment
        config = getattr(model, "config", None)
        assert isinstance(config, BaseModelConfig), "Incompatible Config type."
        snapshot = ExperimentSnapshot(
            experiment_name=experiment_name,
            config=config,
            predictions=predictions,
            metrics=metrics,
            model=model,
        )

        print("\n" + "=" * 70)
        print("FINAL DEV SET RESULTS")
        print("=" * 70)
        print(f"Exact Match (EM): {snapshot.metrics.exact_score:.2f}%")
        print(f"F1 Score: {snapshot.metrics.f1_score:.2f}%")
        print(f"Total dev examples: {snapshot.metrics.total_num_instances}")
        print("=" * 70)

        # Persist artifacts first, then the captured console log next to them.
        run_dir = snapshot.save(experiments_root=EXPERIMENTS_DIR)
        (run_dir / "training_log.txt").write_text(
            log_capture.getvalue(), encoding="utf-8"
        )
        return snapshot, run_dir, df_train, df_dev
    finally:
        # Restore stdout after running the experiment
        sys.stdout = original_stdout
133
+
134
+
135
def create_experiment_name(
    model_name_short: str, config: BertQAConfig, num_articles: Optional[int] = None
) -> str:
    """
    Derive a tracking-friendly experiment name such as "bert_5_articles", or
    "<name>_ALL_articles" when no article cap is applied, after sanity-checking
    that the short model name is consistent with the configured backbone.
    """
    assert (
        model_name_short in config.backbone_name
    ), "Inconsistent model name used for experiment tracking Vs actual model name."
    if num_articles is None:
        return f"{model_name_short}_ALL_articles"
    return f"{model_name_short}_{num_articles}_articles"
147
+
148
+
149
def _load_examples(
    path: Path, debug_limit: int | None, split_fraction: float | None
) -> Tuple[
    Tuple[Dict[str, QAExample], Dict[str, QAExample] | None],
    Tuple[pd.DataFrame, pd.DataFrame | None],
]:
    """
    Load SQuAD v2.0 examples from `path`, returning both the QAExample maps and
    the underlying DataFrames (the latter mainly for debugging).

    Both debug_limit and split_fraction operate at the ARTICLE level rather
    than the individual example/question level:
    - debug_limit: caps the #articles returned for debugging/easier experimentation
    - split_fraction: enables train/val splitting based on initial training data
    """
    df = load_squad_v2_df(path)

    if debug_limit is not None:
        titles = df[Col.TITLE.value].unique()
        assert (
            1 <= debug_limit <= len(titles)
        ), f"debug_limit={debug_limit} exceeds {len(titles)} available articles"

        # Sample whole articles (titles) rather than individual rows, so every
        # kept article retains all of its questions.
        kept_titles = pd.Series(titles).sample(n=debug_limit, random_state=DEBUG_SEED)
        df = df[df[Col.TITLE.value].isin(kept_titles)].copy()

    if split_fraction is None:
        examples = df_to_examples_map(df)
        return (examples, None), (df, None)

    df_train, df_val = split_by_title(df, split_fraction)
    train_examples = df_to_examples_map(df_train)
    val_examples = df_to_examples_map(df_val)
    return (train_examples, val_examples), (df_train, df_val)
184
+
185
+
186
def split_by_title(
    df: pd.DataFrame, val_fraction: float
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split input DF by article title ensuring no title overlap:
    this is critical for model generalization to new contexts Vs
    simply memorizing passages and responding to new questions about them
    (e.g., when splitting the initial training set into 'train' and 'val' subsets).

    Returns (df_train, df_val); both are independent copies.
    """
    assert 0 < val_fraction < 1, "val set fraction should be between (0, 1)."
    unique_titles = df[Col.TITLE.value].drop_duplicates()
    num_unique_titles = len(unique_titles)
    # With fewer than two articles, the max(1, ...) floor below would put the
    # only article into val and silently return an EMPTY train split, which
    # would otherwise surface as a confusing failure much later downstream.
    assert (
        num_unique_titles >= 2
    ), f"Need at least 2 unique titles to split; got {num_unique_titles}."
    # Fixed seed keeps the split reproducible across runs.
    shuffled_titles = unique_titles.sample(frac=1.0, random_state=DEBUG_SEED)

    # Guarantee at least one val article even for tiny inputs/fractions.
    n_val = max(1, int(num_unique_titles * val_fraction))
    val_titles = set(shuffled_titles.iloc[:n_val])
    train_titles = set(shuffled_titles.iloc[n_val:])

    df_val = df[df[Col.TITLE.value].isin(val_titles)].copy()
    df_train = df[df[Col.TITLE.value].isin(train_titles)].copy()
    print(
        f"Initial split | num-train-examples: {df_train.shape[0]}; num-val-examples: {df_val.shape[0]}"
    )
    return df_train, df_val
src/scripts/prepare_hf_deployment.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ # Experiment directory to be used for deployment is passed in as input argument
6
+ exp_dir = Path(sys.argv[1])
7
+ deploy_dir = Path("hf_deployment")
8
+
9
+ # Safety-first: enables first seeing the changes before actually transfering files over (AWS s3 operations-like)
10
+ dry_run = "--dry-run" in sys.argv
11
+ checkpoint = deploy_dir / "checkpoint"
12
+ prefix = "[DRY RUN]" if dry_run else ""
13
+
14
+ print(f"{prefix} Create: {checkpoint}")
15
+ if not dry_run:
16
+ checkpoint.mkdir(parents=True, exist_ok=True)
17
+
18
+ # Individual files
19
+ files = [
20
+ (exp_dir / "config.json", checkpoint / "config.json"),
21
+ (exp_dir / "model/pytorch_model.bin", checkpoint / "pytorch_model.bin"),
22
+ ]
23
+ for src, dst in files:
24
+ print(f"{prefix} Copy: {src} -> {dst}")
25
+ if not dry_run:
26
+ shutil.copy2(src, dst)
27
+
28
+ # Directories (recursively)
29
+ trees = [
30
+ (exp_dir / "model/tokenizer", checkpoint / "tokenizer"),
31
+ (Path("src"), deploy_dir / "src"),
32
+ ]
33
+ for src, dst in trees:
34
+ print(f"{prefix} Copy tree: {src} -> {dst}")
35
+ if not dry_run:
36
+ shutil.copytree(src, dst, dirs_exist_ok=True)
37
+
38
+ if not dry_run:
39
+ print(f"\nDeployment files are ready under {deploy_dir}.")
src/utils/__init__.py ADDED
File without changes
src/utils/constants.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Column name constants for the SQuAD v2.0 DataFrame & raw input field names.
3
+
4
+ Benefits:
5
+ - Single source of truth: schema changes are centralized
6
+ - Safety: typos are caught at definition time rather than scattered string literals
7
+ - IDE support: `Col.` autocompletes all valid names, streamlining typing and making schemas self-documenting
8
+ """
9
+
10
+ from enum import Enum
11
+ from pathlib import Path
12
+
13
# constants.py lives at: <repo>/src/utils/constants.py;
# resolve() addresses symlink issues
REPO_ROOT: Path = Path(__file__).resolve().parent.parent.parent
# SQuAD v2.0 JSON files are expected under <repo>/data/.
DATA_DIR: Path = REPO_ROOT / "data"
# TODO - Placeholder needs to be made smaller for experiments!
TRAIN_DATA_PATH: Path = DATA_DIR / "train-v2.0.json"
DEV_DATA_PATH: Path = DATA_DIR / "dev-v2.0.json"
# Each experiment run is persisted into its own timestamped subdirectory here.
EXPERIMENTS_DIR: Path = REPO_ROOT / "experiments"

# Fixed seed for reproducible sampling/splitting in debug runs.
DEBUG_SEED = 42
23
+
24
+
25
class Col(Enum):
    """Column names of the flattened SQuAD v2.0 DataFrame."""

    # Schema entries below are reused for raw keys with identical names
    TITLE = "title"  # article title; used for article-level sampling/splitting
    QUESTION_ID = "id"  # unique question identifier
    QUESTION = "question"
    CONTEXT = "context"  # passage the answer is extracted from
    ANSWER_TEXTS = "answers"  # gold answer strings
    ANSWER_STARTS = "answer_starts"  # per-answer start positions (cf. RawField.ANSWER_START)
    IS_IMPOSSIBLE = "is_impossible"  # SQuAD v2.0 unanswerable-question flag
    NUM_ANSWERS = "num_answers"  # convenience count of gold answers
35
+
36
+
37
class RawField(Enum):
    """Key names as they appear in the raw SQuAD v2.0 JSON files."""

    VERSION = "version"  # dataset version string
    DATA = "data"  # top-level list of articles
    PARAGRAPHS = "paragraphs"  # per-article list of context paragraphs
    QAS = "qas"  # per-paragraph list of question/answer entries
    # QA-level answers (list of dicts with 'text' and 'answer_start')
    ANSWERS = "answers"
    ANSWER_TEXT = "text"
    ANSWER_START = "answer_start"
src/utils/experiment_snapshot.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A 'snapshot' object used for experiment tracking.
3
+ Contains an experiment_name, the config used, the predictions produced,
4
+ the resulting metrics, the model parameters and optional metadata.
5
+
6
+ Benefits:
7
+ - Single, self-contained function call to persist an experiment run.
8
+ - Clean and automatic organization of experimental results facilitating model improvements.
9
+ """
10
+
11
+ import time, json, torch
12
+ from pathlib import Path
13
+ from dataclasses import dataclass, asdict, is_dataclass
14
+ from typing import Dict, Any, Optional
15
+ from src.config.model_configs import BaseModelConfig
16
+ from src.etl.types import Prediction
17
+ from src.evaluation.metrics import Metrics
18
+ from src.models.bert_based_model import BertBasedQAModel
19
+ from src.models.base_qa_model import QAModel
20
+
21
# Text encoding used for all persisted artifact files in a run directory.
DEFAULT_ENCODING = "utf-8"
22
+
23
+
24
@dataclass(frozen=True)
class ExperimentSnapshot:
    """
    Immutable record of one experiment run: name, config, predictions, metrics,
    optional metadata, and (optionally) the trained model itself.
    `save()` persists everything into a fresh timestamped run directory.
    """

    experiment_name: str
    config: BaseModelConfig
    predictions: Dict[str, Prediction]
    metrics: Metrics
    metadata: Optional[Dict[str, Any]] = None
    model: Optional[QAModel] = None  # stores model reference

    def _timestamped_dir(self, root: Path) -> Path:
        # e.g. <root>/20250101_120000_my_experiment; the timestamp prefix keeps
        # runs chronologically sorted and avoids directory-name collisions.
        ts = time.strftime("%Y%m%d_%H%M%S")
        return root / f"{ts}_{self.experiment_name}"

    def _as_config_dict(self) -> Dict[str, Any]:
        # Dataclass configs are serialized field-by-field; anything else is
        # assumed to be dict-convertible.
        return asdict(self.config) if is_dataclass(self.config) else dict(self.config)

    def _manifest(self, run_id: str) -> Dict[str, Any]:
        # Small index describing where each artifact lives inside the run dir.
        model_type = getattr(self.config, "MODEL_TYPE", None)
        assert model_type is not None, "Unexpected empty model type."
        mani = {
            "run_id": run_id,
            "experiment_name": self.experiment_name,
            "model_type": model_type,
            "artifacts": {
                "config": "config.json",
                "predictions": "predictions.json",
                "metrics": "metrics.json",
                "model": "model/",
            },
        }

        # TODO - consider adding path to model checkpoints once we have those
        if self.metadata:
            mani["metadata"] = self.metadata  # pass-through, unchanged
        return mani

    def save(self, experiments_root: Path = Path("experiments")) -> Path:
        """
        Persist config, predictions, metrics, manifest and (if present) the
        model into a new timestamped directory under `experiments_root`.
        Returns the created run directory. Raises FileExistsError if the
        directory already exists (never silently overwrites a previous run).
        """
        run_dir = self._timestamped_dir(experiments_root)
        # raise error if accidentally attempting to overwrite previous run
        run_dir.mkdir(parents=True, exist_ok=False)

        (run_dir / "config.json").write_text(
            json.dumps(self._as_config_dict(), indent=2), encoding=DEFAULT_ENCODING
        )
        (run_dir / "predictions.json").write_text(
            json.dumps(
                Prediction.flatten_predicted_answers(predictions=self.predictions),
                ensure_ascii=False,  # preserve original characters (e.g., accented characters etc.)
                indent=2,
            ),
            encoding=DEFAULT_ENCODING,
        )
        (run_dir / "metrics.json").write_text(
            json.dumps(self.metrics.export_for_exp_tracking(), indent=2),
            encoding=DEFAULT_ENCODING,
        )

        if self.model is not None:
            self._save_model(run_dir / "model")

        # Manifest is written last so its presence signals a complete run.
        manifest = self._manifest(run_dir.name)
        (run_dir / "manifest.json").write_text(
            json.dumps(manifest, indent=2), encoding=DEFAULT_ENCODING
        )
        return run_dir

    def _save_model(self, model_path: Path) -> None:
        """Save model weights and tokenizer under `model_path`."""
        assert isinstance(
            self.model, BertBasedQAModel
        ), "Currently model saving is only supported for the BertBasedQAModel type."
        model_path.mkdir(parents=True, exist_ok=True)

        # Save model weights
        torch.save(self.model.qa_module.state_dict(), model_path / "pytorch_model.bin")
        # Save tokenizer
        self.model.tokenizer.save_pretrained(model_path / "tokenizer")
        print(f"Model saved to {model_path}")
src/utils/tune_threshold.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minimal threshold tuning script that reuses existing pipeline code.
3
+ Can be run from a Jupyter notebook.
4
+
5
+ Note that tuning does NOT happen on the 'dev' set, which is considered to be
6
+ an external, unseen dataset for objective performance measurement. Threshold
7
+ tuning happens on the 'val' set (which is a small part of the SQuAD v2.0 training).
8
+ """
9
+
10
+ import numpy as np
11
+ from pathlib import Path
12
+ from src.models.bert_based_model import BertBasedQAModel
13
+ from src.etl.squad_v2_loader import load_squad_v2_df, df_to_examples_map
14
+ from src.evaluation.evaluator import Evaluator
15
+ from src.utils.constants import TRAIN_DATA_PATH, DEV_DATA_PATH
16
+ from src.pipeline.qa_runner import split_by_title, DEFAULT_VAL_SET_FRACTION
17
+
18
+
19
def tune_threshold_and_report_final_perf(
    experiment_dir: str | Path,
    config_class,
    device: str,
    threshold_range: np.ndarray | None = None,
):
    """
    Tune the no-answer threshold on the validation split (carved out of the
    training data) and report final performance on the untouched dev set.

    experiment_dir: run directory to load the trained model from
    config_class: config type used to reconstruct the model
    device: device string used for inference
    threshold_range: candidate thresholds; defaults to np.linspace(-2, 2, 9)

    Returns (best_threshold, model).
    """
    # Avoid a mutable (ndarray) default argument: the previous module-level
    # default was created once at import time and shared across all calls,
    # so any in-place mutation would leak between callers. A None sentinel
    # builds a fresh array per call while keeping the same default behavior.
    if threshold_range is None:
        threshold_range = np.linspace(-2, 2, 9)

    experiment_dir = Path(experiment_dir)
    best_threshold, _, _, model = _tune_threshold_on_validation(
        experiment_dir=experiment_dir,
        config_class=config_class,
        device=device,
        threshold_range=threshold_range,
    )

    # Final, objective measurement on the held-out dev set using the tuned threshold.
    dev_examples = df_to_examples_map(load_squad_v2_df(DEV_DATA_PATH))
    final_predictions = model.predict(dev_examples, threshold_override=best_threshold)
    final_metrics = Evaluator().evaluate(final_predictions, dev_examples)
    print(f"Final dev set performance: {final_metrics.export_for_exp_tracking()}")

    return best_threshold, model
43
+
44
+
45
def _tune_threshold_on_validation(
    experiment_dir: Path,
    config_class,
    device: str,
    threshold_range: np.ndarray,
    val_fraction: float = DEFAULT_VAL_SET_FRACTION,
):
    """
    Grid-search the no-answer threshold on the validation split.

    Re-creates the title-based train/val split (split_by_title uses a fixed
    seed, so the same val titles are reproduced — see TODO below), evaluates
    every candidate threshold, and returns
    (best_threshold, best_metrics, results, model), where `results` is a list
    of {"threshold", "em", "f1"} dicts for every candidate tried and selection
    is by highest F1.
    """
    print("=" * 70)
    print("THRESHOLD TUNING ON VALIDATION SET")
    print("=" * 70)

    model = BertBasedQAModel.load_from_experiment(
        experiment_dir, config_class, device=device
    )
    # TODO - can also store/load the exact val question IDs used during training,
    # to be even more certain that we are tuning on the exact val set
    df = load_squad_v2_df(TRAIN_DATA_PATH)
    _, df_val = split_by_title(df, val_fraction)
    val_examples = df_to_examples_map(df_val)

    print(f"\nValidation set: {len(val_examples)} examples")
    print(
        f"Testing {len(threshold_range)} thresholds from {threshold_range.min():.1f} to {threshold_range.max():.1f}\n"
    )

    # Test each threshold
    best_f1 = -1
    best_threshold = None
    best_metrics = None
    results = []

    for threshold in threshold_range:
        predictions = model.predict(val_examples, threshold_override=threshold)
        metrics = Evaluator().evaluate(predictions, val_examples)

        results.append(
            {"threshold": threshold, "em": metrics.exact_score, "f1": metrics.f1_score}
        )

        print(
            f"Threshold: {threshold:6.2f} | EM: {metrics.exact_score:5.2f}% | F1: {metrics.f1_score:5.2f}%"
        )

        # Track the best candidate by F1 (strictly greater keeps the earliest tie).
        if metrics.f1_score > best_f1:
            best_f1 = metrics.f1_score
            best_threshold = threshold
            best_metrics = metrics

    # Type assertion
    assert best_metrics is not None, "No thresholds tested!"

    print("\n" + "=" * 70)
    print("BEST THRESHOLD")
    print("=" * 70)
    print(f"Threshold: {best_threshold:.2f}")
    print(f"EM: {best_metrics.exact_score:.2f}%")
    print(f"F1: {best_metrics.f1_score:.2f}%")
    print("=" * 70)

    return best_threshold, best_metrics, results, model