from __future__ import annotations

from typing import Iterable, Any

# Column names probed, in priority order, when looking for the text field.
_TEXT_COLUMN_CANDIDATES = (
    "text",
    "content",
    "document",
    "passage",
    "passages",
    "contents",
    "wiki_text",
)

# Column names probed, in priority order, when looking for the id field.
_ID_COLUMN_CANDIDATES = ("id", "doc_id", "document_id", "passage_id", "pid")


def pick_text_column(columns: list[str]) -> str:
    """Try to robustly choose the text field from various corpus schemas.

    The known candidate names are checked in priority order and the first
    one present in ``columns`` is returned. If none is present, the first
    entry of ``columns`` is used as a fallback.

    Args:
        columns: Column names of the corpus.

    Returns:
        The name of the column chosen as the text field.

    Raises:
        IndexError: If no candidate matches and ``columns`` is empty, so
            there is no first column to fall back to.
    """
    for candidate in _TEXT_COLUMN_CANDIDATES:
        if candidate in columns:
            return candidate
    try:
        # No known name matched: fall back to the first column.
        return columns[0]
    except IndexError:
        # Same exception type as the bare lookup, but with a message that
        # says what actually went wrong.
        raise IndexError(
            "cannot pick a text column: `columns` is empty"
        ) from None


def pick_id_column(columns: list[str]) -> str | None:
    """Choose the identifier field from various corpus schemas.

    The known candidate names are checked in priority order.

    Args:
        columns: Column names of the corpus.

    Returns:
        The first candidate name present in ``columns``, or ``None`` when
        none of them is present.
    """
    for candidate in _ID_COLUMN_CANDIDATES:
        if candidate in columns:
            return candidate
    return None


def iter_corpus_rows(ds) -> Iterable[dict[str, Any]]:
    """Lazily yield every item produced by iterating over ``ds``.

    Args:
        ds: Any iterable; items are passed through unchanged.
            NOTE(review): the annotation suggests each item is a
            ``dict[str, Any]`` row -- confirm against the callers.

    Yields:
        Each item of ``ds``, in iteration order.
    """
    yield from ds