File size: 3,850 Bytes

23c824a

"""Sentence segmentation + collate for chest2err inference.

Stripped-down version of the in-tree training collate: keeps only what's needed
to run greedy decoding on one (ref, cand) pair.
"""
from __future__ import annotations

import re
from typing import Any, Dict, List, Tuple

import torch


# Any bracketed run of non-"]" characters, e.g. "[FINDINGS]"; segment_report
# replaces each match with a newline so it acts as a hard segment break.
_SECTION_HEADER_RE = re.compile(r"\[[^\]]+\]")
# A newline, optional whitespace, one bullet glyph (-, * or •) and trailing
# whitespace; only bullets that follow a newline are matched.
_BULLET_RE = re.compile(r"\n\s*[-*•]\s+")
# Whitespace that directly follows a period and directly precedes an ASCII
# uppercase letter; the lookarounds keep the period and the letter in the
# pieces produced by split().
_SENT_BOUNDARY_RE = re.compile(r"(?<=\.)\s+(?=[A-Z])")


def segment_report(text: str, min_len: int = 3) -> List[str]:
    """Split a report into sentence-like segments.

    Bracketed section headers and newline-led bullet markers are turned into
    line breaks, each resulting line is stripped, and lines are further split
    at whitespace that follows a period and precedes an uppercase letter.

    Args:
        text: Raw report text. Falsy or whitespace-only input yields ``[]``.
        min_len: Minimum number of characters (after stripping) that a line
            and a sentence must have to be kept.

    Returns:
        The kept segments, in document order.
    """
    if not (text and text.strip()):
        return []

    # Headers first, then bullets: a header replaced by "\n" can itself
    # become the newline that a following bullet marker needs to match.
    normalized = _BULLET_RE.sub("\n", _SECTION_HEADER_RE.sub("\n", text))

    segments: List[str] = []
    for stripped_line in (raw.strip() for raw in normalized.split("\n")):
        if len(stripped_line) >= min_len:
            pieces = map(str.strip, _SENT_BOUNDARY_RE.split(stripped_line))
            segments.extend(piece for piece in pieces if len(piece) >= min_len)
    return segments


def encode_pair_for_decoder(
    tokenizer,
    ref_text: str,
    cand_text: str,
    max_length: int = 1280,
    ref_marker: str = "[REF]",
    cand_marker: str = "[PRED]",
) -> Dict[str, Any]:
    """Tokenize a (reference, candidate) report pair into one flat sequence.

    Both texts are segmented with ``segment_report``. The sequence layout is:
    reference marker, reference segments, candidate marker, candidate
    segments. Every piece is encoded with ``add_special_tokens=False`` and a
    trailing space.

    If the total token count exceeds ``max_length``, whole segments are
    dropped from the end: candidate segments first, then reference segments.
    Markers are never dropped, so the result can still exceed ``max_length``
    once both segment lists are empty.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            returning a sized, iterable sequence of token ids.
        ref_text: Reference report text.
        cand_text: Candidate report text.
        max_length: Token budget for the whole sequence.
        ref_marker: Text placed before the reference segments.
        cand_marker: Text placed before the candidate segments.

    Returns:
        A dict with:
          * ``"input_ids"``: the flat list of token ids;
          * ``"ref_seg_ranges"`` / ``"cand_seg_ranges"``: one half-open
            ``(start, end)`` index pair into ``input_ids`` per kept segment;
          * ``"ref_segs"`` / ``"cand_segs"``: the kept segment strings, in
            the same order as their ranges.
    """
    ref_segs = segment_report(ref_text)
    cand_segs = segment_report(cand_text)

    def _tokenize(piece: str):
        return tokenizer.encode(piece, add_special_tokens=False)

    ref_prefix = _tokenize(ref_marker + " ")
    cand_prefix = _tokenize(" " + cand_marker + " ")
    ref_tokens = [_tokenize(seg + " ") for seg in ref_segs]
    cand_tokens = [_tokenize(seg + " ") for seg in cand_segs]

    # Keep a running token count so trimming does not re-sum every segment.
    n_tokens = (
        len(ref_prefix)
        + len(cand_prefix)
        + sum(map(len, ref_tokens))
        + sum(map(len, cand_tokens))
    )

    # Trim trailing segments: exhaust the candidate side before the reference.
    for seg_tokens, seg_texts in ((cand_tokens, cand_segs), (ref_tokens, ref_segs)):
        while n_tokens > max_length and seg_tokens:
            n_tokens -= len(seg_tokens.pop())
            seg_texts.pop()

    input_ids: List[int] = list(ref_prefix)

    def _append_segments(seg_tokens) -> List[Tuple[int, int]]:
        # Append each segment's ids and record where it landed in input_ids.
        spans: List[Tuple[int, int]] = []
        for seg_ids in seg_tokens:
            start = len(input_ids)
            input_ids.extend(seg_ids)
            spans.append((start, len(input_ids)))
        return spans

    ref_ranges = _append_segments(ref_tokens)
    input_ids.extend(cand_prefix)
    cand_ranges = _append_segments(cand_tokens)

    return {
        "input_ids": input_ids,
        "ref_seg_ranges": ref_ranges,
        "cand_seg_ranges": cand_ranges,
        "ref_segs": ref_segs,
        "cand_segs": cand_segs,
    }


def collate_decoder_batch(items: List[Dict[str, Any]], pad_token_id: int = 0) -> Dict[str, torch.Tensor]:
    """Pad encoded items into batch tensors and build per-segment token masks.

    Args:
        items: Dicts providing ``"input_ids"`` (list of token ids) plus
            ``"ref_seg_ranges"`` and ``"cand_seg_ranges"`` (lists of
            half-open ``(start, end)`` index pairs into ``input_ids``).
        pad_token_id: Value used to right-pad ``input_ids``.

    Returns:
        A dict with, for batch size ``B`` and longest sequence ``T``:
          * ``"input_ids"``: ``(B, T)`` long tensor, right-padded;
          * ``"attention_mask"``: ``(B, T)`` long tensor, 1 on real tokens;
          * ``"ref_seg_token_mask"``: ``(B, Sr, T)`` bool tensor, True on the
            tokens of each reference segment;
          * ``"cand_seg_token_mask"``: ``(B, Sc, T)`` bool tensor, likewise
            for candidate segments.
        ``Sr`` and ``Sc`` are the largest per-item segment counts, floored
        at 1 so the segment axis is never empty.

    Raises:
        ValueError: If ``items`` is empty (``max`` of an empty sequence).
    """
    max_tokens = max(len(item["input_ids"]) for item in items)
    max_ref_segs = max(max(len(item["ref_seg_ranges"]), 1) for item in items)
    max_cand_segs = max(max(len(item["cand_seg_ranges"]), 1) for item in items)
    batch_size = len(items)

    input_ids = torch.full((batch_size, max_tokens), pad_token_id, dtype=torch.long)
    attention_mask = torch.zeros((batch_size, max_tokens), dtype=torch.long)
    ref_seg_token_mask = torch.zeros(
        (batch_size, max_ref_segs, max_tokens), dtype=torch.bool
    )
    cand_seg_token_mask = torch.zeros(
        (batch_size, max_cand_segs, max_tokens), dtype=torch.bool
    )

    for row, item in enumerate(items):
        token_ids = item["input_ids"]
        n_tokens = len(token_ids)
        input_ids[row, :n_tokens] = torch.tensor(token_ids, dtype=torch.long)
        attention_mask[row, :n_tokens] = 1

        # Same fill rule for both sides: one mask row per segment range.
        for seg_mask, seg_ranges in (
            (ref_seg_token_mask, item["ref_seg_ranges"]),
            (cand_seg_token_mask, item["cand_seg_ranges"]),
        ):
            for seg_idx, (start, end) in enumerate(seg_ranges):
                seg_mask[row, seg_idx, start:end] = True

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "ref_seg_token_mask": ref_seg_token_mask,
        "cand_seg_token_mask": cand_seg_token_mask,
    }