Add CLI script for processing JSONL files and generating binary predictions with optional Excel output

Files changed (1) hide show

sdg_predict/cli_conversion.py +86 -0

sdg_predict/cli_conversion.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import argparse
+import json
+import pandas as pd
+import logging
def binary_from_softmax(prediction, cap_class0=0.5):
    """
    Convert a softmax-style prediction list into per-label binary scores.

    Each non-"0" label is contrasted against the (possibly capped) score of
    label "0": ``binary = score / (score + capped_score_0)``, rounded to three
    decimals. If label "0" is absent, its score is taken as 0.0.

    Args:
        prediction: list of {"label": str, "score": float}, or None. None is
            treated like an empty list (e.g. a JSON record with
            ``"prediction": null``).
        cap_class0: float, maximum score allowed for label "0".
    Returns:
        dict of {label: binary_score} for every label other than "0".
        Empty when ``prediction`` is None or contains no non-"0" labels.
    """
    # A missing/null prediction carries no labels; return early instead of
    # failing with "NoneType is not iterable".
    if prediction is None:
        return {}

    # Score of the reference class, limited so that a very confident class 0
    # cannot push every other label's binary score towards zero.
    score_0 = next(
        (entry["score"] for entry in prediction if entry["label"] == "0"), 0.0
    )
    score_0 = min(score_0, cap_class0)

    binary_predictions = {}
    for entry in prediction:
        label = entry["label"]
        if label == "0":
            continue
        score = entry["score"]
        denominator = score + score_0
        # Guard against division by zero when both scores are 0.
        binary_score = score / denominator if denominator > 0 else 0.0
        binary_predictions[label] = round(binary_score, 3)
    return binary_predictions
def process_jsonl(input_file, output_file, cap_class0, excel_file=None):
    """
    Add binary predictions to every record of a JSONL file.

    Each non-blank input line is parsed as JSON, extended with a
    "binary_predictions" key (computed by ``binary_from_softmax`` from the
    record's "prediction" list) and written to ``output_file`` as one JSON
    line. Optionally, a table with one row per record and one column per
    label "1".."17" is written to an Excel file.

    Args:
        input_file: path of the JSONL file to read (UTF-8).
        output_file: path of the JSONL file to write (UTF-8).
        cap_class0: float, maximum score allowed for label "0".
        excel_file: optional path of the Excel file to write; must end in
            ".xlsx".
    Raises:
        ValueError: if ``excel_file`` is given without the .xlsx extension.
            Raised before any file is opened, so no partial output is left.
    """
    # Fail fast: validate the Excel path before doing any work.
    if excel_file and not excel_file.endswith(".xlsx"):
        raise ValueError("Excel file must have the .xlsx extension")

    transformed_data = []
    # Explicit UTF-8: the output is written with ensure_ascii=False, so the
    # platform default encoding must not be relied upon.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entry = json.loads(line)
            # A record may have no "prediction" key or an explicit null.
            prediction = entry.get("prediction") or []
            binary_predictions = binary_from_softmax(prediction, cap_class0)
            entry["binary_predictions"] = binary_predictions
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

            # Only collect rows when an Excel file was actually requested.
            if excel_file:
                row = {"publication_zora_id": entry.get("id")}
                for sdg in range(1, 18):
                    # Labels missing from the prediction default to 0.
                    row[f"dvdblk_sdg{sdg}"] = binary_predictions.get(str(sdg), 0)
                transformed_data.append(row)

    if excel_file:
        logging.info("Writing Excel output to %s", excel_file)
        df_transformed = pd.DataFrame(transformed_data)
        df_transformed.to_excel(excel_file, index=False)
        logging.info("Excel output written to %s", excel_file)
def main(argv=None):
    """
    Command-line entry point: parse arguments and run ``process_jsonl``.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv``.
    """
    # Without configuration the root logger only shows WARNING and above, so
    # the progress messages logged at INFO level would never be displayed.
    # basicConfig does nothing if the root logger already has handlers.
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Process JSONL file and compute binary predictions."
    )
    parser.add_argument("input_file", type=str, help="Path to the input JSONL file.")
    parser.add_argument("output_file", type=str, help="Path to the output JSONL file.")
    parser.add_argument(
        "--cap_class0",
        type=float,
        default=0.5,
        help="Maximum score allowed for class 0.",
    )
    parser.add_argument(
        "--excel",
        type=str,
        help="Path to the Excel file for binary predictions (optional).",
    )
    args = parser.parse_args(argv)
    process_jsonl(args.input_file, args.output_file, args.cap_class0, args.excel)


if __name__ == "__main__":
    main()