from typing import Dict, Any, List
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from torch.cuda.amp import autocast
class EndpointHandler:
    """Inference endpoint handler for a propositionizer seq2seq model.

    Loads the model and tokenizer once at construction time and serves
    batched text-to-propositions generation requests via ``__call__``.
    """

    # Shared generation settings, kept in one place so the CUDA and CPU
    # code paths cannot drift apart.
    _GEN_KWARGS = dict(
        max_new_tokens=512,
        no_repeat_ngram_size=5,
        length_penalty=1.2,
        num_beams=5,
    )

    def __init__(self, path="chentong00/propositionizer-wiki-flan-t5-large"):
        """
        Initialize the handler by loading the model, tokenizer, and setting the device.

        Args:
            path (str): Hugging Face model id or local checkpoint directory.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForSeq2SeqLM.from_pretrained(path).to(self.device)
        # fp16 weights only make sense on GPU; many CPU ops lack half-precision
        # kernels, so keep full precision when running on CPU.
        if self.device.type == "cuda":
            model = model.half()
        self.model = model

    def process_chunks(
        self, chunks: List[str], titles: List[str], dates: List[str]
    ) -> List[str]:
        """
        Process multiple text chunks with the model.

        Args:
            chunks (list): List of text content to process.
            titles (list): List of document titles corresponding to the chunks.
            dates (list): List of document dates corresponding to the chunks.
        Returns:
            list: List of generated output texts, one per input chunk.
        """
        input_texts = [
            f"Title: {t}. Date: {d}. Content: {c}"
            for c, t, d in zip(chunks, titles, dates)
        ]
        # Keep the full BatchEncoding so the attention mask is available:
        # with padding=True, generate() must be told which positions are
        # padding or uneven batches can attend to pad tokens.
        encoded = self.tokenizer(
            input_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        ).to(self.device)
        outputs = None  # pre-bind so the finally-block cleanup is always safe
        try:
            with torch.no_grad():
                if self.device.type == "cuda":
                    # Mixed precision on GPU. torch.autocast supersedes the
                    # deprecated torch.cuda.amp.autocast.
                    with torch.autocast(device_type="cuda", dtype=torch.float16):
                        outputs = self.model.generate(
                            encoded.input_ids,
                            attention_mask=encoded.attention_mask,
                            **self._GEN_KWARGS,
                        )
                else:
                    outputs = self.model.generate(
                        encoded.input_ids,
                        attention_mask=encoded.attention_mask,
                        **self._GEN_KWARGS,
                    )
            predictions = self.tokenizer.batch_decode(
                outputs, skip_special_tokens=True
            )
        finally:
            # Explicit memory cleanup. `outputs` is pre-initialised above so
            # this del cannot raise NameError and mask an exception raised by
            # generate() (the original bug in this block).
            del encoded, outputs
            if self.device.type == "cuda":
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        return predictions

    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Dict[str, Any]]]:
        """
        Handle the inference request.

        Args:
            data (dict): The payload with text inputs under the "inputs" key;
                each entry must be a dict with "id", "chunk", "title", "date".
        Returns:
            dict: {"results": [{"id": ..., "generated_text": ...}, ...]},
            outputs paired with their input IDs in order.
        Raises:
            ValueError: If "inputs" is not a list of dicts or a required key
                is missing from any entry.
        """
        inputs = data.get("inputs", [])
        # Ensure inputs is a list of dictionaries
        if not isinstance(inputs, list) or not all(isinstance(i, dict) for i in inputs):
            raise ValueError("The inputs must be a list of dictionaries.")
        chunks, titles, dates, ids = [], [], [], []
        for item in inputs:
            for key in ["id", "chunk", "title", "date"]:
                if key not in item:
                    raise ValueError(f"Each input must contain the key: {key}.")
            ids.append(item["id"])
            chunks.append(item["chunk"])
            titles.append(item["title"])
            dates.append(item["date"])
        predictions = self.process_chunks(chunks, titles, dates)
        result = [
            {"id": id_, "generated_text": prediction}
            for id_, prediction in zip(ids, predictions)
        ]
        return {"results": result}