Spaces:
Sleeping
Sleeping
File size: 675 Bytes
7c2e31a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
from __future__ import annotations
from typing import Iterable, Any
def pick_text_column(columns: list[str]) -> str:
    """Choose the column that most likely holds the document text.

    Well-known text column names are tried in priority order and the first
    one present in ``columns`` is returned. When none of them is present,
    the first entry of ``columns`` is used as a fallback.

    Args:
        columns: Column names available in the corpus.

    Returns:
        The selected column name.

    Raises:
        IndexError: If ``columns`` is empty, so there is no column to fall
            back to.
    """
    # Order matters: earlier names win when several are present.
    candidates = (
        "text",
        "content",
        "document",
        "passage",
        "passages",
        "contents",
        "wiki_text",
    )
    for name in candidates:
        if name in columns:
            return name
    if not columns:
        # Same exception type as the bare ``columns[0]`` lookup would raise,
        # but with a message that points at the actual cause.
        raise IndexError("cannot pick a text column: no columns were provided")
    return columns[0]
def pick_id_column(columns: list[str]) -> str | None:
    """Return the first known identifier column found in ``columns``.

    The known identifier names are tried in priority order; ``None`` is
    returned when none of them is present.
    """
    known_ids = ("id", "doc_id", "document_id", "passage_id", "pid")
    matches = (name for name in known_ids if name in columns)
    return next(matches, None)
def iter_corpus_rows(ds) -> Iterable[dict[str, Any]]:
    """Lazily yield every row of ``ds`` in its own iteration order.

    ``ds`` only needs to be iterable; nothing is consumed until the
    returned generator is iterated.
    """
    yield from ds
|