File size: 1,717 Bytes
9302284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from __future__ import annotations

from typing import Dict, List, Sequence, Tuple

import numpy as np


def normalize_text(text: str) -> str:
    """Collapse every whitespace run in *text* to one space and trim the ends.

    A falsy ``text`` (for example ``None`` or ``""``) yields the empty string.
    """
    if not text:
        return ""
    tokens = text.split()
    return " ".join(tokens)


def to_label_vector(example: Dict, label_fields: Sequence[str]) -> List[float]:
    """Build a float vector from *example*, one entry per name in *label_fields*.

    A field missing from ``example`` counts as ``0``; any value that
    ``float()`` fails to convert is replaced by ``0.0``. The output order
    follows ``label_fields``.
    """

    def _as_float(raw) -> float:
        # Best-effort coercion: unconvertible labels fall back to 0.0.
        try:
            return float(raw)
        except Exception:
            return 0.0

    return [_as_float(example.get(field, 0)) for field in label_fields]


def is_all_zero(vec: Sequence[float]) -> bool:
    """Return True when no entry of *vec* is greater than zero.

    Each entry is passed through ``float()`` first. Negative entries count
    the same as zero, and an empty ``vec`` returns True.
    """
    for entry in vec:
        # Written as "not <=" rather than ">" so that NaN (which fails every
        # comparison) still makes the vector count as not-all-zero.
        if not float(entry) <= 0.0:
            return False
    return True


def downsample_negatives(
    texts: List[str],
    label_vectors: List[List[float]],
    ratio_keep: float,
    seed: int = 42,
) -> Tuple[List[str], List[List[float]]]:
    """Subsample negative examples while keeping every positive one.

    An example is a negative when ``is_all_zero`` is true for its label
    vector. ``int(n_negatives * ratio_keep)`` negatives are drawn without
    replacement, merged with all positives, and the result is shuffled.
    Sampling and shuffling both use a NumPy generator seeded with ``seed``.

    When ``ratio_keep >= 1.0`` the two input lists are returned unchanged
    (same objects, original order, no shuffling).

    Returns:
        A ``(texts, label_vectors)`` pair of equally long, index-aligned lists.
    """
    if ratio_keep >= 1.0:
        return texts, label_vectors

    rng = np.random.default_rng(seed)

    # Split example positions into positives and negatives in one pass.
    positives: List[int] = []
    negatives: List[int] = []
    for position, labels in enumerate(label_vectors):
        bucket = negatives if is_all_zero(labels) else positives
        bucket.append(position)

    # Truncate toward zero; a quota of zero (or less) keeps no negatives.
    quota = int(len(negatives) * ratio_keep)
    if quota > 0:
        drawn = rng.choice(negatives, size=quota, replace=False)
        sampled = set(drawn.tolist())
    else:
        sampled = set()

    # Positives first, then the sampled negatives, then shuffle jointly so
    # texts and labels stay aligned.
    selected = positives + list(sampled)
    order = rng.permutation(len(selected)).tolist()
    shuffled_texts = [texts[selected[j]] for j in order]
    shuffled_labels = [label_vectors[selected[j]] for j in order]
    return shuffled_texts, shuffled_labels