"""Convert softmax-style predictions in a JSONL file into binary scores.

Each input line is a JSON object carrying a ``prediction`` list of
``{"label": str, "score": float}`` items.  Every record is written back to the
output JSONL file with an extra ``binary_predictions`` mapping, and the scores
can optionally be exported to an Excel sheet as well.
"""

import argparse
import json
import logging
from typing import Optional

import pandas as pd

logger = logging.getLogger(__name__)

# Labels exported as Excel columns: "1" .. "17" (presumably the 17 SDGs, going
# by the column prefix -- confirm against the model's label set).
SDG_IDS = range(1, 18)
EXCEL_COLUMN_PREFIX = "dvdblk_sdg"


def binary_from_softmax(prediction: list, cap_class0: float = 0.5) -> dict:
    """Turn a softmax-style prediction into per-label binary scores.

    The score of label ``"0"`` (0.0 when absent) is capped at ``cap_class0``.
    Every other label is then scored as ``score / (score + capped_score_0)``,
    rounded to three decimals.

    Args:
        prediction: list of ``{"label": str, "score": float}`` items.
        cap_class0: maximum score allowed for label ``"0"``.

    Returns:
        dict mapping each non-"0" label to its binary score.  When a label
        occurs more than once, the last occurrence wins.
    """
    score_0 = next(
        (item["score"] for item in prediction if item["label"] == "0"), 0.0
    )
    score_0 = min(score_0, cap_class0)

    binary_predictions = {}
    for item in prediction:
        label = item["label"]
        if label == "0":
            continue
        score = item["score"]
        denominator = score + score_0
        # Guard against division by zero when both scores are 0.
        binary_score = score / denominator if denominator > 0 else 0.0
        binary_predictions[label] = round(binary_score, 3)
    return binary_predictions


def _excel_row(entry: dict) -> dict:
    """Build one flat Excel row (id + one column per SDG id) from a record."""
    scores = entry["binary_predictions"]
    row = {"publication_zora_id": entry.get("id")}
    for sdg in SDG_IDS:
        # Labels missing from the prediction are exported as 0.
        row[f"{EXCEL_COLUMN_PREFIX}{sdg}"] = scores.get(str(sdg), 0)
    return row


def process_jsonl(
    input_file: str,
    output_file: str,
    cap_class0: float,
    excel_file: Optional[str] = None,
) -> None:
    """Add binary predictions to every record of a JSONL file.

    Args:
        input_file: path to the input JSONL file (read as UTF-8).
        output_file: path to the output JSONL file (written as UTF-8).
        cap_class0: maximum score allowed for label ``"0"``.
        excel_file: optional path of an ``.xlsx`` file that receives one row
            per record with the binary scores of labels "1".."17".

    Raises:
        ValueError: if ``excel_file`` is given without the ``.xlsx`` extension.
            The check runs before any file is opened, so nothing is written
            in that case.
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
    """
    # Fail fast: validate the Excel target before doing any work or
    # truncating the output file.
    if excel_file and not excel_file.endswith(".xlsx"):
        raise ValueError("Excel file must have the .xlsx extension")

    excel_rows = []
    # Explicit UTF-8: the output is written with ensure_ascii=False, so a
    # locale-dependent default encoding could corrupt or reject the text.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            entry = json.loads(line)
            # A missing or null "prediction" is treated as an empty list.
            prediction = entry.get("prediction") or []
            entry["binary_predictions"] = binary_from_softmax(
                prediction, cap_class0
            )
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

            # Only keep rows in memory when an Excel export was requested.
            if excel_file:
                excel_rows.append(_excel_row(entry))

    if excel_file:
        logger.info("Writing Excel output to %s", excel_file)
        # Explicit columns keep the header row even when the input is empty.
        columns = ["publication_zora_id"] + [
            f"{EXCEL_COLUMN_PREFIX}{sdg}" for sdg in SDG_IDS
        ]
        df_transformed = pd.DataFrame(excel_rows, columns=columns)
        df_transformed.to_excel(excel_file, index=False)
        logger.info("Excel output written to %s", excel_file)


def main() -> None:
    """Parse command-line arguments and run :func:`process_jsonl`."""
    parser = argparse.ArgumentParser(
        description="Process JSONL file and compute binary predictions."
    )
    parser.add_argument(
        "input_file", type=str, help="Path to the input JSONL file."
    )
    parser.add_argument(
        "output_file", type=str, help="Path to the output JSONL file."
    )
    parser.add_argument(
        "--cap_class0",
        type=float,
        default=0.5,
        help="Maximum score allowed for class 0.",
    )
    parser.add_argument(
        "--excel",
        type=str,
        help="Path to the Excel file for binary predictions (optional).",
    )
    args = parser.parse_args()

    # Without this the INFO progress messages are never displayed, because
    # the root logger defaults to WARNING.
    logging.basicConfig(level=logging.INFO)
    process_jsonl(
        args.input_file, args.output_file, args.cap_class0, args.excel
    )


if __name__ == "__main__":
    main()