File size: 675 Bytes
7c2e31a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
from __future__ import annotations

from typing import Iterable, Any


def pick_text_column(columns: list[str]) -> str:
    """
    Choose the column that holds the document text.

    Well-known text column names are tried in priority order and the first
    one present in ``columns`` is returned. When none of them is present,
    the first entry of ``columns`` is used as a fallback.

    Args:
        columns: Column names to choose from.

    Returns:
        The selected column name.

    Raises:
        IndexError: If ``columns`` is empty, so there is no column to fall
            back to.
    """
    # Order matters: earlier names take precedence when several are present.
    preferred = ("text", "content", "document", "passage", "passages", "contents", "wiki_text")
    for name in preferred:
        if name in columns:
            return name
    try:
        return columns[0]
    except IndexError:
        # Keep the exception type of the bare lookup, but say what went wrong.
        raise IndexError("cannot pick a text column: `columns` is empty") from None


def pick_id_column(columns: list[str]) -> str | None:
    """
    Choose the column that holds the document identifier.

    Known identifier column names are tried in priority order; the first one
    present in ``columns`` is returned, or ``None`` when none is present.
    """
    known_id_names = ("id", "doc_id", "document_id", "passage_id", "pid")
    matches = (name for name in known_id_names if name in columns)
    return next(matches, None)


def iter_corpus_rows(ds) -> Iterable[dict[str, Any]]:
    """
    Lazily yield every item of ``ds`` in iteration order.

    ``ds`` may be any iterable; items are passed through unchanged.
    """
    yield from ds