chest2err / chest2err_collate.py

Fix loader: correct CADAD signature, bundle sentence-segmentation collate + concept vocab, decode_greedy path verified end-to-end

23c824a verified 6 days ago

Raw

History Blame Contribute Delete

3.85 kB

	"""Sentence segmentation + collate for chest2err inference.

	Stripped-down version of the in-tree training collate: keeps only what's needed
	to run greedy decoding on one (ref, cand) pair.
	"""
	from __future__ import annotations

	import re
	from typing import Any, Dict, List, Tuple

	import torch


	# Matches one bracketed span such as "[FINDINGS]": an opening "[", one or more
	# characters that are not "]", then the closing "]". segment_report() replaces
	# every match with a newline, so bracketed text acts as a hard segment break.
	_SECTION_HEADER_RE = re.compile(r"\[[^\]]+\]")
	# Matches a newline, exactly one whitespace character, a "-" or "•" bullet and
	# the whitespace run after it. segment_report() replaces the match with a bare
	# newline, which removes the bullet glyph from the start of the item.
	# NOTE(review): only a single whitespace character is accepted between the
	# newline and the bullet, so a bullet with no indent or a deeper indent is left
	# in the text -- confirm this is intended (vs. `\s*`) before changing it.
	_BULLET_RE = re.compile(r"\n\s[-•]\s+")
	# Matches the whitespace run between a "." and an ASCII capital letter. Both
	# sides are zero-width lookarounds, so splitting on this pattern keeps the
	# period on the left sentence and the capital on the right one.
	_SENT_BOUNDARY_RE = re.compile(r"(?<=\.)\s+(?=[A-Z])")


	def segment_report(text: str, min_len: int = 3) -> List[str]:
	    """Split a report into a flat list of sentence-like segments.

	    Bracketed spans and bullet markers are first replaced by newlines. Each
	    resulting line is stripped and then split wherever a period is followed by
	    whitespace and an ASCII capital letter.

	    Args:
	        text: Raw report text. A falsy or whitespace-only value yields ``[]``.
	        min_len: Minimum character length, measured after stripping. Lines
	            shorter than this are skipped entirely, and so are individual
	            sentences shorter than this.

	    Returns:
	        The stripped segments in their original order.
	    """
	    if not (text and text.strip()):
	        return []

	    # Section headers are substituted before bullets, in that order.
	    flattened = _BULLET_RE.sub("\n", _SECTION_HEADER_RE.sub("\n", text))

	    sentences: List[str] = []
	    for raw_line in flattened.split("\n"):
	        candidate_line = raw_line.strip()
	        if len(candidate_line) >= min_len:
	            pieces = (piece.strip() for piece in _SENT_BOUNDARY_RE.split(candidate_line))
	            sentences.extend(piece for piece in pieces if len(piece) >= min_len)
	    return sentences


	def encode_pair_for_decoder(
	    tokenizer,
	    ref_text: str,
	    cand_text: str,
	    max_length: int = 1280,
	    ref_marker: str = "[REF]",
	    cand_marker: str = "[PRED]",
	) -> Dict[str, Any]:
	    """Tokenize a (reference, candidate) pair and record per-segment token spans.

	    The token stream is laid out as: reference marker, reference segments,
	    candidate marker, candidate segments. Each segment is tokenized on its own
	    with a trailing space appended, so its span in the stream is known exactly.

	    If the stream is longer than ``max_length``, whole segments are dropped from
	    the tail: candidate segments go first, and reference segments are dropped
	    only once no candidate segment is left. The marker tokens are never
	    dropped, so the result can still exceed ``max_length`` if the markers alone
	    are longer than the limit.

	    Args:
	        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
	            that returns a sized, iterable sequence of token ids (presumably a
	            Hugging Face tokenizer -- confirm against the caller).
	        ref_text: Reference report text.
	        cand_text: Candidate report text.
	        max_length: Token budget for the full stream.
	        ref_marker: Text placed before the reference segments.
	        cand_marker: Text placed before the candidate segments.

	    Returns:
	        A dict with ``input_ids`` (flat list of token ids), ``ref_seg_ranges``
	        and ``cand_seg_ranges`` (half-open ``(start, end)`` index pairs into
	        ``input_ids``, one per kept segment), and ``ref_segs`` / ``cand_segs``
	        (the kept segment strings, aligned with the ranges).
	    """
	    ref_segs = segment_report(ref_text)
	    cand_segs = segment_report(cand_text)

	    def _tokenize(piece):
	        return tokenizer.encode(piece, add_special_tokens=False)

	    ref_prefix = _tokenize(ref_marker + " ")
	    cand_prefix = _tokenize(" " + cand_marker + " ")
	    ref_tokens = [_tokenize(segment + " ") for segment in ref_segs]
	    cand_tokens = [_tokenize(segment + " ") for segment in cand_segs]

	    # Keep a running token count rather than re-summing after every drop.
	    total = (
	        len(ref_prefix)
	        + sum(map(len, ref_tokens))
	        + len(cand_prefix)
	        + sum(map(len, cand_tokens))
	    )
	    # Candidate tail is trimmed first; the reference is touched only if the
	    # stream is still too long after every candidate segment is gone.
	    for token_lists in (cand_tokens, ref_tokens):
	        while total > max_length and token_lists:
	            total -= len(token_lists.pop())
	    # Re-align the segment strings with the token lists that survived.
	    cand_segs = cand_segs[:len(cand_tokens)]
	    ref_segs = ref_segs[:len(ref_tokens)]

	    input_ids: List[int] = list(ref_prefix)

	    def _append_segments(token_lists) -> List[Tuple[int, int]]:
	        # Append each segment to the stream and remember its half-open span.
	        spans: List[Tuple[int, int]] = []
	        for tokens in token_lists:
	            begin = len(input_ids)
	            input_ids.extend(tokens)
	            spans.append((begin, len(input_ids)))
	        return spans

	    ref_ranges = _append_segments(ref_tokens)
	    input_ids.extend(cand_prefix)
	    cand_ranges = _append_segments(cand_tokens)

	    return {
	        "input_ids": input_ids,
	        "ref_seg_ranges": ref_ranges,
	        "cand_seg_ranges": cand_ranges,
	        "ref_segs": ref_segs,
	        "cand_segs": cand_segs,
	    }


	def collate_decoder_batch(items: List[Dict[str, Any]], pad_token_id: int = 0) -> Dict[str, torch.Tensor]:
	    """Pad encoded examples into batch tensors with per-segment token masks.

	    Args:
	        items: Encoded examples, each a dict with ``input_ids`` (list of ints)
	            and ``ref_seg_ranges`` / ``cand_seg_ranges`` (lists of half-open
	            ``(start, end)`` index pairs into ``input_ids``). Must be
	            non-empty; an empty list makes ``max()`` raise ``ValueError``.
	        pad_token_id: Id written into positions past each example's length.

	    Returns:
	        A dict of tensors, where B is the batch size, T the longest
	        ``input_ids`` and Sr / Sc the largest reference / candidate segment
	        count (each at least 1):

	        * ``input_ids``: ``(B, T)`` long, right-padded with ``pad_token_id``.
	        * ``attention_mask``: ``(B, T)`` long, 1 on real tokens and 0 on padding.
	        * ``ref_seg_token_mask``: ``(B, Sr, T)`` bool, True where a token
	          belongs to that reference segment.
	        * ``cand_seg_token_mask``: ``(B, Sc, T)`` bool, the same for candidate
	          segments.
	    """
	    longest = max(len(example["input_ids"]) for example in items)
	    # Keep at least one segment slot so the mask tensors never have a zero-size axis.
	    ref_slots = max(1, max(len(example["ref_seg_ranges"]) for example in items))
	    cand_slots = max(1, max(len(example["cand_seg_ranges"]) for example in items))
	    batch_size = len(items)

	    padded_ids = torch.full((batch_size, longest), pad_token_id, dtype=torch.long)
	    token_valid = torch.zeros((batch_size, longest), dtype=torch.long)
	    # Keyed by the per-example field holding the spans that fill each mask.
	    segment_masks = {
	        "ref_seg_ranges": torch.zeros((batch_size, ref_slots, longest), dtype=torch.bool),
	        "cand_seg_ranges": torch.zeros((batch_size, cand_slots, longest), dtype=torch.bool),
	    }

	    for row, example in enumerate(items):
	        token_ids = example["input_ids"]
	        n_tokens = len(token_ids)
	        padded_ids[row, :n_tokens] = torch.tensor(token_ids, dtype=torch.long)
	        token_valid[row, :n_tokens] = 1
	        for field, mask in segment_masks.items():
	            for slot, (start, stop) in enumerate(example[field]):
	                mask[row, slot, start:stop] = True

	    return {
	        "input_ids": padded_ids,
	        "attention_mask": token_valid,
	        "ref_seg_token_mask": segment_masks["ref_seg_ranges"],
	        "cand_seg_token_mask": segment_masks["cand_seg_ranges"],
	    }