# File size: 3,743 Bytes
#
# b95938c

import argparse
import json
import os

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)


def chunk_and_classify(text, classifier, tokenizer, max_len=512, stride=50):
    """
    Classify a long text by averaging per-label scores over overlapping chunks.

    The text is tokenized once *without* special tokens and cut into windows
    of token IDs. Each window is decoded back to a string, passed to
    ``classifier``, and the resulting scores are averaged per label.

    Each window is sized so that the chunk plus the special tokens the
    tokenizer adds on re-encoding fits within ``max_len``.

    Args:
        text (str): The input text to be chunked and classified.
        classifier (Callable): Called as ``classifier(chunk)``; the first
            element of its result must be an iterable of dictionaries with
            ``"label"`` and ``"score"`` keys.
        tokenizer (Callable): Called as
            ``tokenizer(text, add_special_tokens=False)`` and must return a
            mapping whose ``"input_ids"`` entry is a flat sequence of token
            IDs. It must also provide ``decode(ids, skip_special_tokens=True)``.
            If it exposes ``num_special_tokens_to_add()``, that many tokens
            are reserved in every chunk.
        max_len (int, optional): Maximum chunk length in tokens, including
            the reserved special tokens. Defaults to 512.
        stride (int, optional): Number of tokens shared by consecutive
            chunks. Defaults to 50.

    Returns:
        dict: Maps each label of the first chunk's result to the mean of that
        label's score across all chunks.

    Raises:
        ValueError: If ``max_len`` leaves no room for content tokens, or if
            ``stride`` is not smaller than the usable window (the chunking
            loop could not advance).
    """
    # Reserve room for the special tokens the classifier's tokenizer will add
    # back when it re-encodes each decoded chunk.
    count_special = getattr(tokenizer, "num_special_tokens_to_add", None)
    reserved = count_special() if callable(count_special) else 0

    window = max_len - reserved
    if window <= 0:
        raise ValueError(
            f"max_len ({max_len}) must be larger than the number of special "
            f"tokens ({reserved})"
        )
    step = window - stride
    if step <= 0:
        raise ValueError(
            f"stride ({stride}) must be smaller than the usable chunk "
            f"length ({window})"
        )

    # Tokenize the entire document once; special tokens are left out so that
    # every ID in a window is real content.
    token_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    total = len(token_ids)

    # NOTE(review): decoding and re-tokenizing a window is not guaranteed to
    # reproduce exactly the same number of tokens (e.g. when a window starts
    # in the middle of a word) -- confirm whether the classifier should also
    # truncate its input.
    chunks = []
    # max(total, 1) keeps a single (empty) chunk for empty input, so there is
    # always at least one chunk to classify and average over.
    for start in range(0, max(total, 1), step):
        window_ids = token_ids[start : start + window]
        chunks.append(tokenizer.decode(window_ids, skip_special_tokens=True))
        if start + window >= total:
            # This window already reached the end of the document.
            break

    # Classify each chunk; element 0 of the result is the list of
    # {label, score} dictionaries.
    chunk_scores = [
        {entry["label"]: entry["score"] for entry in classifier(chunk)[0]}
        for chunk in chunks
    ]

    # Average the scores per label across all chunks.
    n_chunks = len(chunk_scores)
    return {
        label: sum(scores[label] for scores in chunk_scores) / n_chunks
        for label in chunk_scores[0]
    }


def main():
    """
    Command-line entry point: classify a text with a trained model.

    Parses the command-line arguments, loads the tokenizer and the sequence
    classification model from ``--model_dir``, classifies either the
    ``--text`` string or the contents of ``--input_file``, and prints the
    per-label scores to stdout as indented JSON.
    """
    # Command-line arguments accepted by this program.
    default_dir = "~/Code/Huggingface-metadata-project/BERTley/checkpoint-3486"
    parser = argparse.ArgumentParser(
        description="Run inference on a trained BERT metadata classifier"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default=default_dir,
        help="Directory where your trained model and config live",
    )
    # Exactly one of --text / --input_file must be given.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--text", type=str, help="Raw text string to classify")
    group.add_argument(
        "--input_file",
        type=str,
        help="Path to a .txt file containing the document to classify",
    )
    args = parser.parse_args()

    # The default path starts with "~", which the shell never gets a chance
    # to expand, so expand it here. Values without a leading "~" are returned
    # unchanged.
    model_dir = os.path.expanduser(args.model_dir)

    # 1) Load tokenizer + model (config.json should have the id2label/label2id
    # baked in by the training script).
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    # 2) Build the pipeline; return_all_scores=True requests a score for
    # every label rather than only the top one.
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True,
    )

    # 3) Read the document. The context manager closes the file even if
    # reading fails.
    if args.input_file is not None:
        with open(args.input_file, "r", encoding="utf-8") as handle:
            text = handle.read()
    else:
        text = args.text

    # 4) Texts that fit within the token limit are classified in one call;
    # longer ones are chunked and their scores averaged.
    max_len = 512
    tokens = tokenizer(text, return_tensors="pt")["input_ids"]
    if tokens.size(1) <= max_len:
        result = classifier(text)[0]
        scores = {d["label"]: d["score"] for d in result}
    else:
        scores = chunk_and_classify(text, classifier, tokenizer, max_len=max_len)

    # 5) Print the scores.
    print(json.dumps(scores, indent=2))


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()