DunasAnastasiia
Initial commit (Xet)
7c2e31a
raw
history blame contribute delete
675 Bytes
from __future__ import annotations
from typing import Iterable, Any
def pick_text_column(columns: list[str]) -> str:
    """
    Try to robustly choose the text field from various corpus schemas.

    A fixed list of well-known text column names is checked in priority
    order, and the first one that appears in ``columns`` is returned. When
    none of them is present, the first entry of ``columns`` is used as a
    fallback.

    Args:
        columns: Column names of the corpus, in schema order.

    Returns:
        The name of the column to read text from.

    Raises:
        IndexError: If ``columns`` is empty, so there is nothing to pick.
    """
    # Ordered by preference: an earlier name wins when several are present.
    candidates = (
        "text",
        "content",
        "document",
        "passage",
        "passages",
        "contents",
        "wiki_text",
    )
    for name in candidates:
        if name in columns:
            return name
    if not columns:
        # Keep the exception type a bare columns[0] lookup would raise, but
        # say what actually went wrong instead of "list index out of range".
        raise IndexError("cannot pick a text column: no columns were given")
    return columns[0]
def pick_id_column(columns: list[str]) -> str | None:
    """
    Return the first well-known identifier column name present in ``columns``.

    The names "id", "doc_id", "document_id", "passage_id" and "pid" are
    checked in that order. ``None`` is returned when none of them appears.
    """
    known_id_names = ("id", "doc_id", "document_id", "passage_id", "pid")
    # next() with a default stops at the first match, or yields None if no
    # known name is found.
    return next((name for name in known_id_names if name in columns), None)
def iter_corpus_rows(ds) -> Iterable[dict[str, Any]]:
    """
    Lazily yield every row produced by iterating over ``ds``.

    Rows are passed through unchanged, in iteration order.
    """
    yield from ds