File size: 2,874 Bytes
fa342d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import argparse
import json
import pandas as pd
import logging


def binary_from_softmax(prediction, cap_class0=0.5):
    """
    Converts a softmax-style prediction into per-label binary scores.

    Each label other than "0" is scored as score / (score + class-0 score),
    where the class-0 score is first limited to at most cap_class0. If no
    entry is labelled "0", the class-0 score is taken as 0.0.

    Args:
        prediction: list of {"label": str, "score": float}
        cap_class0: float, upper bound applied to the score of label "0"

    Returns:
        dict of {label: binary_score}, each score rounded to 3 decimals
    """
    # Find the first class-0 entry; fall back to 0.0 when there is none.
    baseline = 0.0
    for item in prediction:
        if item["label"] == "0":
            baseline = item["score"]
            break

    # Apply the cap so a dominant class 0 cannot suppress every other label.
    if baseline > cap_class0:
        baseline = cap_class0

    def _contrast(value):
        # Guard against a zero (or negative) denominator.
        total = value + baseline
        if total > 0:
            return value / total
        return 0.0

    return {
        item["label"]: round(_contrast(item["score"]), 3)
        for item in prediction
        if item["label"] != "0"
    }


def process_jsonl(input_file, output_file, cap_class0, excel_file=None):
    """
    Adds binary predictions to every record of a JSONL file and optionally
    exports them as an Excel sheet.

    Each non-blank input line is parsed as a JSON object, extended with a
    "binary_predictions" key computed by binary_from_softmax from its
    "prediction" list, and written as one line to output_file.

    Args:
        input_file: path to the input JSONL file (read as UTF-8)
        output_file: path to the output JSONL file (written as UTF-8)
        cap_class0: float, maximum class-0 score passed to binary_from_softmax
        excel_file: optional path to an .xlsx file; when given, one row per
            record is written with the record "id" and columns
            dvdblk_sdg1 .. dvdblk_sdg17 (0 where a label is missing)

    Raises:
        ValueError: if excel_file is given but does not end with ".xlsx"
        json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # Validate the Excel target first so a bad path fails before any file is
    # read or written, not after the whole input has been processed.
    if excel_file and not excel_file.endswith(".xlsx"):
        raise ValueError("Excel file must have the .xlsx extension")

    transformed_data = []
    # The encoding is explicit because records are dumped with
    # ensure_ascii=False; relying on the platform default can raise
    # UnicodeEncodeError or mis-decode input on non-UTF-8 locales.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entry = json.loads(line)
            # "prediction" may be absent or explicitly null; treat both as empty.
            prediction = entry.get("prediction") or []
            entry["binary_predictions"] = binary_from_softmax(prediction, cap_class0)
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

            # Only collect rows when an Excel export was requested, so plain
            # JSONL runs do not hold every record in memory.
            if excel_file:
                transformed_row = {
                    "publication_zora_id": entry.get("id"),
                    **{
                        f"dvdblk_sdg{sdg}": entry["binary_predictions"].get(str(sdg), 0)
                        for sdg in range(1, 18)
                    },
                }
                transformed_data.append(transformed_row)

    if excel_file:
        logging.info("Writing Excel output to %s", excel_file)
        df_transformed = pd.DataFrame(transformed_data)
        df_transformed.to_excel(excel_file, index=False)
        logging.info("Excel output written to %s", excel_file)


def main(argv=None):
    """
    Command-line entry point: parses arguments and runs process_jsonl.

    Args:
        argv: optional list of argument strings. When None, argparse reads
            sys.argv[1:], so calling main() without arguments behaves as before.
    """
    parser = argparse.ArgumentParser(
        description="Process JSONL file and compute binary predictions."
    )
    parser.add_argument("input_file", type=str, help="Path to the input JSONL file.")
    parser.add_argument("output_file", type=str, help="Path to the output JSONL file.")
    parser.add_argument(
        "--cap_class0",
        type=float,
        default=0.5,
        help="Maximum score allowed for class 0.",
    )
    parser.add_argument(
        "--excel",
        type=str,
        help="Path to the Excel file for binary predictions (optional).",
    )

    args = parser.parse_args(argv)

    # process_jsonl reports progress through logging.info; without an explicit
    # level the root logger stays at WARNING and those messages are dropped.
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    process_jsonl(args.input_file, args.output_file, args.cap_class0, args.excel)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()