sdg-scibert-zo_up / sdg_predict / cli_conversion.py

Simon Clematide

Add CLI script for processing JSONL files and generating binary predictions with optional Excel output

fa342d2 9 months ago

2.87 kB

	import argparse
	import json
	import pandas as pd
	import logging


	def binary_from_softmax(prediction, cap_class0=0.5):
	    """
	    Convert a softmax-style prediction into per-label binary scores.

	    The score of label "0" is first capped at ``cap_class0``. Every other
	    label then gets ``score / (score + capped_score_0)``, rounded to three
	    decimals.

	    Args:
	        prediction: list of {"label": str, "score": float}; ``None`` is
	            treated like an empty list.
	        cap_class0: float, maximum score allowed for label "0"

	    Returns:
	        dict of {label: binary_score} for all labels except "0". A label
	        whose denominator is not positive gets 0.0.
	    """
	    # A record may carry an explicit null prediction; treat it as "no scores"
	    # instead of failing on iteration.
	    entries = prediction or []

	    # A missing class-0 entry counts as a class-0 score of 0.0.
	    score_0 = next((e["score"] for e in entries if e["label"] == "0"), 0.0)
	    score_0 = min(score_0, cap_class0)

	    binary_predictions = {}
	    for entry in entries:
	        label = entry["label"]
	        if label == "0":
	            continue
	        score = entry["score"]
	        denominator = score + score_0
	        # Guard against division by zero when both scores are 0.
	        binary_score = score / denominator if denominator > 0 else 0.0
	        binary_predictions[label] = round(binary_score, 3)

	    return binary_predictions


	def process_jsonl(input_file, output_file, cap_class0, excel_file=None):
	    """
	    Add binary predictions to every record of a JSONL file.

	    Each non-blank input line is parsed as JSON, extended with a
	    "binary_predictions" key (computed by ``binary_from_softmax``) and
	    written to ``output_file`` as one JSON object per line. When
	    ``excel_file`` is given, a table with one row per record and the columns
	    "publication_zora_id" and "dvdblk_sdg1" ... "dvdblk_sdg17" is written too.

	    Args:
	        input_file: path of the JSONL file to read
	        output_file: path of the JSONL file to write
	        cap_class0: float, maximum score allowed for label "0"
	        excel_file: optional path of the Excel file; must end in ".xlsx"

	    Raises:
	        ValueError: if ``excel_file`` is given without the .xlsx extension
	    """
	    # Validate up front so a bad Excel path fails before any output is written.
	    if excel_file and not excel_file.endswith(".xlsx"):
	        raise ValueError("Excel file must have the .xlsx extension")

	    transformed_data = []
	    # Explicit UTF-8: records are dumped with ensure_ascii=False, so relying
	    # on the platform default encoding could fail on non-ASCII text.
	    with open(input_file, "r", encoding="utf-8") as infile, open(
	        output_file, "w", encoding="utf-8"
	    ) as outfile:
	        for line in infile:
	            if not line.strip():
	                # Tolerate blank lines (e.g. a trailing empty line).
	                continue
	            entry = json.loads(line)
	            # "prediction" may be missing or null; both mean "no scores".
	            prediction = entry.get("prediction") or []
	            entry["binary_predictions"] = binary_from_softmax(prediction, cap_class0)
	            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

	            # Rows are only needed for the optional Excel output.
	            if excel_file:
	                transformed_row = {"publication_zora_id": entry.get("id")}
	                for sdg in range(1, 18):
	                    # Labels absent from the prediction default to 0.
	                    transformed_row[f"dvdblk_sdg{sdg}"] = entry[
	                        "binary_predictions"
	                    ].get(str(sdg), 0)
	                transformed_data.append(transformed_row)

	    if excel_file:
	        logging.info("Writing Excel output to %s", excel_file)
	        df_transformed = pd.DataFrame(transformed_data)
	        df_transformed.to_excel(excel_file, index=False)
	        logging.info("Excel output written to %s", excel_file)


	def main(argv=None):
	    """
	    Command-line entry point: parse arguments and run ``process_jsonl``.

	    Args:
	        argv: optional list of argument strings. ``None`` (the default)
	            makes argparse read the arguments from ``sys.argv``.
	    """
	    parser = argparse.ArgumentParser(
	        description="Process JSONL file and compute binary predictions."
	    )
	    parser.add_argument("input_file", type=str, help="Path to the input JSONL file.")
	    parser.add_argument("output_file", type=str, help="Path to the output JSONL file.")
	    parser.add_argument(
	        "--cap_class0",
	        type=float,
	        default=0.5,
	        help="Maximum score allowed for class 0.",
	    )
	    parser.add_argument(
	        "--excel",
	        type=str,
	        help="Path to the Excel file for binary predictions (optional).",
	    )

	    args = parser.parse_args(argv)

	    # The processing step reports progress via logging.info; without this
	    # call those messages are dropped at the root logger's default WARNING
	    # level.
	    logging.basicConfig(level=logging.INFO)

	    process_jsonl(args.input_file, args.output_file, args.cap_class0, args.excel)


	# Run the CLI only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()