File size: 2,874 Bytes
fa342d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import argparse
import json
import pandas as pd
import logging
def binary_from_softmax(prediction, cap_class0=0.5):
    """
    Turn a multi-class score list into per-label binary scores.

    The score of the first entry labelled "0" (0.0 when there is none) is
    limited to at most ``cap_class0``. Every other label is then scored as
    ``score / (score + capped_class0_score)``, rounded to three decimals;
    when that denominator is not positive the binary score is 0.0.

    Args:
        prediction: list of {"label": str, "score": float}
        cap_class0: float, upper bound applied to the score of label "0"
    Returns:
        dict of {label: binary_score} for every label other than "0"
    """
    # Locate the class-0 score; only the first matching entry is used.
    baseline = 0.0
    for item in prediction:
        if item["label"] == "0":
            baseline = item["score"]
            break
    baseline = min(baseline, cap_class0)

    def _contrast(value):
        # Share of `value` in the two-way split against the capped baseline.
        total = value + baseline
        if total > 0:
            return round(value / total, 3)
        return round(0.0, 3)

    return {
        item["label"]: _contrast(item["score"])
        for item in prediction
        if item["label"] != "0"
    }
def process_jsonl(input_file, output_file, cap_class0, excel_file=None):
    """
    Add binary predictions to every record of a JSONL file.

    Each non-blank line of ``input_file`` is parsed as a JSON object. Its
    "prediction" list (empty when the key is missing) is passed to
    ``binary_from_softmax`` and the result is stored under
    "binary_predictions" before the record is written to ``output_file``.
    When ``excel_file`` is given, a table with one row per record is written
    as well: the record's "id" in column "publication_zora_id" and the binary
    scores of labels "1" to "17" in columns "dvdblk_sdg1" to "dvdblk_sdg17"
    (0 where a label is absent).

    Args:
        input_file: path of the JSONL file to read (UTF-8)
        output_file: path of the JSONL file to write (UTF-8)
        cap_class0: float, forwarded to ``binary_from_softmax``
        excel_file: optional path of an .xlsx file for the tabular output
    Raises:
        ValueError: if ``excel_file`` is given but does not end in ".xlsx".
            Raised before any file is opened.
    """
    want_excel = bool(excel_file)
    # Validate first so a bad Excel path fails before the input is processed
    # and before the output file is created or truncated.
    if want_excel and not excel_file.endswith(".xlsx"):
        raise ValueError("Excel file must have the .xlsx extension")

    transformed_data = []
    # Records are written with ensure_ascii=False, so the encoding is pinned
    # to UTF-8 instead of relying on the platform default.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            # Blank lines (e.g. a trailing newline) are not JSON; skip them.
            if not line.strip():
                continue
            entry = json.loads(line)
            prediction = entry.get("prediction", [])
            entry["binary_predictions"] = binary_from_softmax(prediction, cap_class0)
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

            if not want_excel:
                # No Excel output requested: do not keep rows in memory.
                continue

            # Prepare data for Excel output
            transformed_row = {
                "publication_zora_id": entry.get("id"),
                **{
                    f"dvdblk_sdg{sdg}": entry["binary_predictions"].get(str(sdg), 0)
                    for sdg in range(1, 18)
                },
            }
            transformed_data.append(transformed_row)

    if want_excel:
        logging.info("Writing Excel output to %s", excel_file)
        df_transformed = pd.DataFrame(transformed_data)
        df_transformed.to_excel(excel_file, index=False)
        logging.info("Excel output written to %s", excel_file)
def main(argv=None):
    """
    Command-line entry point: parse arguments and run ``process_jsonl``.

    Args:
        argv: optional list of argument strings; when None, argparse reads
            the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Process JSONL file and compute binary predictions."
    )
    parser.add_argument("input_file", type=str, help="Path to the input JSONL file.")
    parser.add_argument("output_file", type=str, help="Path to the output JSONL file.")
    parser.add_argument(
        "--cap_class0",
        type=float,
        default=0.5,
        help="Maximum score allowed for class 0.",
    )
    parser.add_argument(
        "--excel",
        type=str,
        help="Path to the Excel file for binary predictions (optional).",
    )
    args = parser.parse_args(argv)

    # The root logger defaults to WARNING, which would drop the INFO progress
    # messages emitted while writing the Excel file. basicConfig does nothing
    # if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    process_jsonl(args.input_file, args.output_file, args.cap_class0, args.excel)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|