Refactor CLI prediction script to enhance argument parsing and modularize inference logic. Add Excel generation.

Files changed (2) hide show

sdg_predict/cli_predict.py +157 -64
sdg_predict/inference.py +70 -0

sdg_predict/cli_predict.py CHANGED Viewed

@@ -2,11 +2,17 @@
 import argparse
 import json
 from pathlib import Path
-from tqdm import tqdm
-import sys
-import torch
-from sdg_predict.inference import load_model, predict
 import logging
 # Set up logging
 logging.basicConfig(
@@ -14,80 +20,138 @@ logging.basicConfig(
 )
-def main():
     parser = argparse.ArgumentParser(
         description="Batch inference using Hugging Face model."
     )
-    parser.add_argument("input", type=Path, help="Input JSONL file")
     parser.add_argument(
-        "--key", type=str, default="text", help="JSON key with text input"
     )
-    parser.add_argument("--batch_size", "-b", type=int, default=8, help="Batch size")
     parser.add_argument(
         "--model",
         type=str,
         default="simon-clmtd/sdg-scibert-zo_up",
-        help="Model name on the Hub",
     )
     parser.add_argument(
-        "--top1", action="store_true", help="Return only top prediction"
     )
     parser.add_argument(
-        "--output", type=Path, help="Output file (optional, otherwise stdout)"
     )
-    args = parser.parse_args()
-    # -------------------------------
-    # 1. Device Setup (MPS support for Apple Silicon)
-    # -------------------------------
-    if torch.backends.mps.is_available():
-        device = torch.device("mps")
-        logging.info("Using MPS device")
-    elif torch.cuda.is_available():
-        device = torch.device("cuda")
-        logging.info("Using CUDA device")
-    else:
-        device = torch.device("cpu")
-        logging.info("Using CPU device")
-    # device = torch.device("cpu")
-    logging.info("Loading model: %s", args.model)
-    tokenizer, model = load_model(args.model, device)
-    logging.info("Model loaded successfully")
-    with args.input.open() as f:
-        texts = []
-        rows = []
-        for line in f:
-            row = json.loads(line)
-            if args.key not in row:
-                continue
-            texts.append(row[args.key])
-            logging.debug("Text: %s", row[args.key])
-            rows.append(row)
-    logging.info("Starting predictions on %d texts", len(texts))
-    predictions = predict(
-        texts,
-        tokenizer,
-        model,
-        device,
-        batch_size=args.batch_size,
-        return_all_scores=not args.top1,
     )
-    logging.info("Predictions completed")
-    output_stream = args.output.open("w") if args.output else sys.stdout
     for row, pred in zip(rows, predictions):
-        # Compute binary probabilities for labels 1-17
-        binary_predictions = {}
-        for label_data in pred:
-            label_data["score"] = round(
-                label_data["score"], 3
-            )  # Round prediction scores to 3 decimal places
-            label = int(label_data["label"])
-            if 1 <= label <= 17:
-                binary_prob = label_data["score"]  # Already rounded
-                binary_predictions[str(label)] = binary_prob
         output_row = {
             "id": row.get("id"),
@@ -95,11 +159,40 @@ def main():
             "prediction": pred,
             "binary_predictions": binary_predictions,
         }
         print(json.dumps(output_row, ensure_ascii=False), file=output_stream)
-    if args.output:
         output_stream.close()
-        logging.info("Output written to %s", args.output)
 if __name__ == "__main__":
-    main()

 import argparse
 import json
 from pathlib import Path
+from typing import List, Dict, Union
+from sdg_predict.inference import (
+    load_model_and_tokenizer,
+    load_input_data,
+    perform_predictions,
+    setup_device,
+    binary_from_softmax,
+)
 import logging
+import pandas as pd
 # Set up logging
 logging.basicConfig(
 )
def parse_arguments(argv: Union[List[str], None] = None) -> argparse.Namespace:
    """
    Parse command-line arguments for the script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, which is
            the behaviour the script had before this parameter existed.

    Returns:
        Parsed arguments as a Namespace object.
    """
    parser = argparse.ArgumentParser(
        description="Batch inference using Hugging Face model."
    )
    # Required positional argument: it has no default, so the help text must
    # not advertise one.
    parser.add_argument("input", type=Path, help="Input JSONL file")
    parser.add_argument(
        "--key",
        type=str,
        default="text",
        help="JSON key with text input (default: 'text')",
    )
    parser.add_argument(
        "--batch_size", "-b", type=int, default=8, help="Batch size (default: 8)"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="simon-clmtd/sdg-scibert-zo_up",
        help="Model name on the Hub (default: 'simon-clmtd/sdg-scibert-zo_up')",
    )
    parser.add_argument(
        "--top1",
        action="store_true",
        help="Return only top prediction (default: False)",
    )
    parser.add_argument(
        "--output",
        "-o",
        type=Path,
        help="Output file (default: write to stdout)",
    )
    parser.add_argument(
        "--binarization",
        type=str,
        choices=["one-vs-all", "one-vs-0"],
        default="one-vs-0",
        help="Binarization method: 'one-vs-all' or 'one-vs-0' (default: 'one-vs-0')",
    )
    parser.add_argument(
        "--sdg0-cap-prob",
        type=float,
        default=0.5,
        help=(
            "Maximum score allowed for class 0 in 'one-vs-0' binarization (default:"
            " 0.5)"
        ),
    )
    parser.add_argument(
        "--excel",
        "-e",
        type=Path,
        help="Path to the Excel file for binary predictions (optional)",
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers that
    # pass nothing are unaffected.
    return parser.parse_args(argv)
def main(
    input: Path,
    key: str,
    batch_size: int,
    model: str,
    top1: bool,
    output: Union[Path, None],
    binarization: str,
    sdg0_cap_prob: float,
    excel: Union[Path, None],
) -> None:
    """
    Run the full batch-inference pipeline: device, model, data, predict, write.

    Args:
        input: Path to the input JSONL file.
        key: JSON key containing the text input.
        batch_size: Batch size for inference.
        model: Model name or path.
        top1: Whether to return only the top prediction.
        output: Path to the output file (optional).
        binarization: Binarization method ('one-vs-all' or 'one-vs-0').
        sdg0_cap_prob: Maximum score allowed for class 0 in 'one-vs-0' binarization.
        excel: Path to the Excel file for binary predictions (optional).
    """
    logging.info("Starting main function")

    # The steps run strictly in this order; each one logs its own progress.
    compute_device = setup_device()
    tokenizer, classifier = load_model_and_tokenizer(model, compute_device)
    texts, rows = load_input_data(input, key)
    predictions = perform_predictions(
        texts, tokenizer, classifier, compute_device, batch_size, top1
    )
    write_output(
        rows,
        predictions,
        output,
        binarization,
        sdg0_cap_prob,
        excel,
    )

    logging.info("Main function completed")
def write_output(
    rows: List[Dict],
    predictions: List,
    output: Union[Path, None],
    binarization: str,
    sdg0_cap_prob: float,
    excel: Union[Path, None] = None,
) -> None:
    """
    Write the predictions to the output file or stdout, and optionally to an Excel file.

    One JSON object per input row is emitted with the keys ``id``,
    ``prediction`` and ``binary_predictions``. When ``excel`` is given, a
    table with one ``dvdblk_sdg<N>`` column per SDG 1-17 is written as well.

    Args:
        rows: List of input rows.
        predictions: List of predictions, paired with ``rows`` by position.
            In the 'one-vs-all' branch each prediction is iterated as a
            sequence of mappings with ``"label"`` and ``"score"`` keys.
        output: Path to the output file (optional, stdout when falsy).
        binarization: Binarization method ('one-vs-all' or 'one-vs-0').
        sdg0_cap_prob: Maximum score allowed for class 0 in 'one-vs-0' binarization.
        excel: Path to the Excel file (optional).

    Raises:
        ValueError: If ``binarization`` is not a supported method.
    """
    # Validate before touching the output file so a bad method name neither
    # truncates an existing file nor reuses a previous row's values.
    if binarization not in ("one-vs-all", "one-vs-0"):
        raise ValueError(
            f"Unknown binarization method: {binarization!r} "
            "(expected 'one-vs-all' or 'one-vs-0')"
        )

    logging.info("Writing output to %s", output or "stdout")
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned instead of depending on the platform default.
    output_stream = output.open("w", encoding="utf-8") if output else None
    want_excel = bool(excel)
    transformed_data = []

    try:
        for row, pred in zip(rows, predictions):
            if binarization == "one-vs-all":
                # Index the scores once; setdefault keeps the first entry for
                # a label, matching a first-match linear search.
                score_by_label = {}
                for entry in pred:
                    score_by_label.setdefault(int(entry["label"]), entry["score"])
                binary_predictions = {
                    str(label): round(score_by_label.get(label, 0), 3)
                    for label in range(1, 18)
                }
            else:
                binary_predictions = binary_from_softmax(pred, sdg0_cap_prob)

            output_row = {
                "id": row.get("id"),
                "prediction": pred,
                "binary_predictions": binary_predictions,
            }
            if want_excel:
                transformed_data.append(
                    {
                        "publication_zora_id": row.get("id"),
                        **{
                            f"dvdblk_sdg{sdg}": binary_predictions.get(str(sdg), 0)
                            for sdg in range(1, 18)
                        },
                    }
                )
            # file=None makes print() fall back to sys.stdout.
            print(json.dumps(output_row, ensure_ascii=False), file=output_stream)
    finally:
        # Close the file even when a row fails to serialise.
        if output_stream is not None:
            output_stream.close()

    if output:
        logging.info("Output written to %s", output)

    if want_excel:
        logging.info("Writing Excel output to %s", excel)
        df_transformed = pd.DataFrame(transformed_data)
        df_transformed.to_excel(excel, index=False)
        logging.info("Excel output written to %s", excel)

    logging.info("Output writing completed")
 if __name__ == "__main__":
+    args = parse_arguments()
+    main(
+        input=args.input,
+        key=args.key,
+        batch_size=args.batch_size,
+        model=args.model,
+        top1=args.top1,
+        output=args.output,
+        binarization=args.binarization,
+        sdg0_cap_prob=args.sdg0_cap_prob,
+        excel=args.excel,
+    )

sdg_predict/inference.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 import logging
 def load_model(model_name, device):
@@ -43,3 +44,72 @@ def predict(texts, tokenizer, model, device, batch_size=8, return_all_scores=Tru
             )  # Round top score to 3 decimal places
     return results

 from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
 import torch
 import logging
+import json
 def load_model(model_name, device):
             )  # Round top score to 3 decimal places
     return results
def binary_from_softmax(prediction, cap_class0=0.5):
    """
    Turn multi-class scores into per-label 'one-vs-0' binary scores.

    Each non-zero label's score ``s`` is rescaled to ``s / (s + s0)``, where
    ``s0`` is the score of label "0" capped at ``cap_class0``. Labels 1-17
    are always present in the result and default to 0.0.

    Args:
        prediction: Iterable of mappings with ``"label"`` and ``"score"``
            keys. Labels may be strings or integers; they are compared and
            stored by their string form.
        cap_class0: Upper bound applied to the class-0 score before rescaling.

    Returns:
        Dict mapping label strings to binary scores rounded to 3 decimals.
    """
    # Normalise labels once. Without this, integer labels never match "0":
    # class 0 is treated as a regular label and results use non-string keys.
    scored = [(str(entry["label"]), entry["score"]) for entry in prediction]

    score_0 = next((score for label, score in scored if label == "0"), 0.0)
    score_0 = min(score_0, cap_class0)

    # Initialize all labels to 0.0 so absent labels still appear in the result.
    binary_predictions = {str(label): 0.0 for label in range(1, 18)}

    for label, score in scored:
        if label == "0":
            continue
        denominator = score + score_0
        # Guard against a zero (or negative) denominator when both scores are 0.
        binary_score = score / denominator if denominator > 0 else 0.0
        binary_predictions[label] = round(binary_score, 3)
    return binary_predictions
def setup_device():
    """
    Select the torch device to run on.

    Backends are probed in a fixed order (MPS first, then CUDA) and the first
    one reporting itself available is returned; otherwise the CPU device is
    used. The choice is logged at INFO level.

    Returns:
        A ``torch.device`` for "mps", "cuda" or "cpu".
    """
    logging.info("Setting up device")

    # (device name, log message, availability probe), in priority order.
    accelerators = (
        ("mps", "Using MPS device", torch.backends.mps.is_available),
        ("cuda", "Using CUDA device", torch.cuda.is_available),
    )
    for device_name, message, is_available in accelerators:
        if is_available():
            logging.info(message)
            return torch.device(device_name)

    logging.info("Using CPU device")
    return torch.device("cpu")
def load_model_and_tokenizer(model_name, device):
    """
    Load a tokenizer/model pair through ``load_model`` with progress logging.

    Args:
        model_name: Model identifier handed to ``load_model``.
        device: Device handed to ``load_model``.

    Returns:
        The ``(tokenizer, model)`` pair produced by ``load_model``.
    """
    logging.info("Loading model: %s", model_name)
    loaded_pair = load_model(model_name, device)
    # Unpack before logging success so a malformed result fails first.
    tokenizer, model = loaded_pair
    logging.info("Model loaded successfully")
    return tokenizer, model
def load_input_data(input, key):
    """
    Read a JSONL file and collect the rows that carry the requested key.

    Args:
        input: Path-like object exposing ``open()``; read as UTF-8 text with
            one JSON document per line.
        key: Key whose value is used as the text for a row. Rows without
            this key are skipped.

    Returns:
        A ``(texts, rows)`` tuple of equal-length lists: the extracted text
        values and the full parsed rows they came from.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    logging.info("Loading input data from %s", input)
    texts = []
    rows = []
    skipped = 0
    # JSON text is UTF-8 by convention; do not rely on the platform default.
    with input.open(encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at EOF) are not records.
            if not line.strip():
                continue
            row = json.loads(line)
            if key not in row:
                skipped += 1
                continue
            texts.append(row[key])
            logging.debug("Text: %s", row[key])
            rows.append(row)
    if skipped:
        # Make silently dropped rows visible to the operator.
        logging.warning("Skipped %d rows without key %r", skipped, key)
    logging.info("Loaded %d rows of input data", len(rows))
    return texts, rows
def perform_predictions(texts, tokenizer, model, device, batch_size, top1):
    """
    Run ``predict`` over the texts, logging the start and the end of the run.

    Args:
        texts: Sized collection of inputs forwarded to ``predict``.
        tokenizer: Tokenizer forwarded to ``predict``.
        model: Model forwarded to ``predict``.
        device: Device forwarded to ``predict``.
        batch_size: Forwarded as ``predict``'s ``batch_size``.
        top1: When true, ``predict`` is called with ``return_all_scores=False``.

    Returns:
        Whatever ``predict`` returns.
    """
    logging.info("Starting predictions on %d texts", len(texts))
    # top1 is the inverse of predict's return_all_scores flag.
    want_all_scores = not top1
    results = predict(
        texts,
        tokenizer,
        model,
        device,
        batch_size=batch_size,
        return_all_scores=want_all_scores,
    )
    logging.info("Predictions completed")
    return results