Simon Clematide
Add CLI script for processing JSONL files and generating binary predictions with optional Excel output
fa342d2
| import argparse | |
| import json | |
| import pandas as pd | |
| import logging | |
def binary_from_softmax(prediction, cap_class0=0.5):
    """
    Turn a softmax-style prediction into per-label binary scores.

    Every label other than "0" is contrasted with the class-0 score, which
    is first limited to at most ``cap_class0``. The binary score of a label
    is ``score / (score + capped_class0_score)``, rounded to 3 decimals.

    Args:
        prediction: list of {"label": str, "score": float}
        cap_class0: float, upper bound applied to the score of label "0"

    Returns:
        dict of {label: binary_score}; label "0" itself is never included.
    """
    # Take the first class-0 entry; a prediction without one contributes 0.0.
    baseline = 0.0
    for item in prediction:
        if item["label"] == "0":
            baseline = item["score"]
            break
    baseline = min(baseline, cap_class0)

    contrasted = {}
    for item in prediction:
        name = item["label"]
        if name != "0":
            value = item["score"]
            denominator = value + baseline
            if denominator > 0:
                ratio = value / denominator
            else:
                # Nothing to normalise against; avoid dividing by zero.
                ratio = 0.0
            contrasted[name] = round(ratio, 3)
    return contrasted
def process_jsonl(input_file, output_file, cap_class0, excel_file=None):
    """
    Add binary predictions to every record of a JSONL file.

    Each non-blank line of ``input_file`` is parsed as a JSON object. Its
    "prediction" list (empty when the key is missing) is passed to
    ``binary_from_softmax`` and the result is stored in the record under
    "binary_predictions" before the record is written to ``output_file``.

    When ``excel_file`` is given, one row per record is additionally written
    to an Excel sheet with the columns "publication_zora_id" (the record's
    "id") and "dvdblk_sdg1" ... "dvdblk_sdg17" (the binary score of labels
    "1" ... "17", 0 when a label is absent).

    Args:
        input_file: path of the JSONL file to read (decoded as UTF-8).
        output_file: path of the JSONL file to write (encoded as UTF-8).
        cap_class0: float, forwarded to ``binary_from_softmax``.
        excel_file: optional path of the Excel file; must end in ".xlsx".

    Raises:
        ValueError: if ``excel_file`` is given without the ".xlsx" extension.
            The check runs before any file is opened, so an invalid path
            never leaves a half-finished run behind.
    """
    wants_excel = bool(excel_file)
    # Fail fast: validate the Excel path before doing any work.
    if wants_excel and not excel_file.endswith(".xlsx"):
        raise ValueError("Excel file must have the .xlsx extension")

    transformed_data = []
    # Explicit UTF-8: the records are written with ensure_ascii=False, so a
    # locale-dependent default encoding could fail on non-ASCII text.
    with open(input_file, "r", encoding="utf-8") as infile, open(
        output_file, "w", encoding="utf-8"
    ) as outfile:
        for line in infile:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            entry = json.loads(line)
            prediction = entry.get("prediction", [])
            binary_predictions = binary_from_softmax(prediction, cap_class0)
            entry["binary_predictions"] = binary_predictions
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

            # Only collect spreadsheet rows when they will actually be used.
            if wants_excel:
                transformed_row = {"publication_zora_id": entry.get("id")}
                for sdg in range(1, 18):
                    transformed_row[f"dvdblk_sdg{sdg}"] = binary_predictions.get(
                        str(sdg), 0
                    )
                transformed_data.append(transformed_row)

    if wants_excel:
        logging.info("Writing Excel output to %s", excel_file)
        df_transformed = pd.DataFrame(transformed_data)
        df_transformed.to_excel(excel_file, index=False)
        logging.info("Excel output written to %s", excel_file)
def main(argv=None):
    """
    Command-line entry point: parse arguments and run ``process_jsonl``.

    Args:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description="Process JSONL file and compute binary predictions."
    )
    parser.add_argument("input_file", type=str, help="Path to the input JSONL file.")
    parser.add_argument("output_file", type=str, help="Path to the output JSONL file.")
    parser.add_argument(
        "--cap_class0",
        type=float,
        default=0.5,
        help="Maximum score allowed for class 0.",
    )
    parser.add_argument(
        "--excel",
        type=str,
        help="Path to the Excel file for binary predictions (optional).",
    )
    args = parser.parse_args(argv)

    # Without this the root logger stays at WARNING and the INFO progress
    # messages emitted while writing the Excel file are silently dropped.
    logging.basicConfig(level=logging.INFO)

    process_jsonl(args.input_file, args.output_file, args.cap_class0, args.excel)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()