Spaces:

BengaliNLP
/

bertspace

Runtime error

App Files Files Community

Start-GPT commited on Jun 12, 2024

Commit

b71e56d

verified ·

1 Parent(s): 6659220

Create model_api.py

Browse files

Files changed (1) hide show

server/model_api.py +153 -0

server/model_api.py ADDED Viewed

	@@ -0,0 +1,153 @@

+from typing import List, Union, Tuple
+import torch
+from transformers import AutoConfig, AutoTokenizer, AutoModelWithLMHead, AutoModel
+from transformer_formatter import TransformerOutputFormatter
+from utils.f import delegates, pick, memoize
+@memoize
+def get_details(mname):
+    return ModelDetails(mname)
+def get_model_tok(mname):
+    conf = AutoConfig.from_pretrained(mname, output_attentions=True, output_past=False)
+    tok = AutoTokenizer.from_pretrained(mname, config=conf)
+    model = AutoModelWithLMHead.from_pretrained(mname, config=conf)
+    return model, tok
+class ModelDetails:
+    """Wraps a transformer model and tokenizer to prepare inputs to the frontend visualization"""
+    def __init__(self, mname):
+        self.mname = mname
+        self.model, self.tok = get_model_tok(self.mname)
+        self.model.eval()
+        self.config = self.model.config
+    def from_sentence(self, sentence: str) -> TransformerOutputFormatter:
+        """Get attentions and word probabilities from a sentence. Special tokens are automatically added if a sentence is passed.
+        Args:
+            sentence: The input sentence to tokenize and analyze.
+        """
+        tokens = self.tok.tokenize(sentence)
+        return self.from_tokens(tokens, sentence, add_special_tokens=True)
+    def from_tokens(
+        self, tokens: List[str], orig_sentence:str, add_special_tokens:bool=False, mask_attentions:bool=False, topk:int=5
+    ) -> TransformerOutputFormatter:
+        """Get formatted attention and predictions from a list of tokens.
+        Args:
+            tokens: Tokens to analyze
+            orig_sentence: The sentence the tokens came from (needed to help organize the output)
+            add_special_tokens: Whether to add special tokens like CLS / <|endoftext|> to the tokens.
+                If False, assume the tokens already have the special tokens
+            mask_attentions: If True, do not pay attention to attention patterns to special tokens through the model.
+            topk: How many top predictions to report
+        """
+        ids = self.tok.convert_tokens_to_ids(tokens)
+        # For GPT2, add the beginning of sentence token to the input. Note that this will work on all models but XLM
+        bost = self.tok.bos_token_id
+        clst = self.tok.cls_token_id
+        sept = self.tok.sep_token_id
+        if (bost is not None) and (bost != clst)and add_special_tokens:
+            ids.insert(0, bost)
+        inputs = self.tok.prepare_for_model(ids, add_special_tokens=add_special_tokens, return_tensors="pt")
+        parsed_input = self.parse_inputs(inputs, mask_attentions=mask_attentions)
+        output = self.model(parsed_input['input_ids'], attention_mask=parsed_input['attention_mask'])
+        logits, atts = self.choose_logits_att(output)
+        words, probs = self.logits2words(logits, topk)
+        tokens = self.view_ids(inputs["input_ids"])
+        formatted_output = TransformerOutputFormatter(
+            orig_sentence,
+            tokens,
+            inputs["special_tokens_mask"],
+            atts,
+            words,
+            probs.tolist(),
+            self.config
+        )
+        return formatted_output
+    def choose_logits_att(self, out:Tuple) -> Tuple:
+        """Select from the model's output the logits and the attentions, switching on model name
+        Args:
+            out: Output from the model's forward pass
+        Returns:
+            (logits: tensor((bs, N)), attentions: Tuple[tensor(())])
+        """
+        if 't5' in self.mname:
+            logits, _, atts = out
+        else:
+            logits, atts = out
+        return logits, atts
+    def logits2words(self, logits, topk):
+        """Convert logit probabilities into words from the tokenizer's vocabulary.
+        """
+        probs, idxs = torch.topk(torch.softmax(logits.squeeze(0), 1), topk)
+        words = [self.tok.convert_ids_to_tokens(i) for i in idxs]
+        return words, probs
+    def view_ids(self, ids: Union[List[int], torch.Tensor]) -> List[str]:
+        """View what the tokenizer thinks certain ids are for a single input"""
+        if type(ids) == torch.Tensor:
+            # Remove batch dimension
+            ids = ids.squeeze(0).tolist()
+        out = self.tok.convert_ids_to_tokens(ids)
+        return out
+    def parse_inputs(self, inputs, mask_attentions=False):
+        """Parse the output from `tokenizer.prepare_for_model` to the desired attention mask from special tokens
+        Args:
+            - inputs: The output of `tokenizer.prepare_for_model`.
+                A dict with keys: {'special_token_mask', 'token_type_ids', 'input_ids'}
+            - mask_attentions: Flag indicating whether to mask the attentions or not
+        Returns:
+            Dict with keys: {'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'}
+        Usage:
+            ```
+            s = "test sentence"
+            # from raw sentence to tokens
+            tokens = tokenizer.tokenize(s)
+            # From tokens to ids
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+            # From ids to input
+            inputs = tokenizer.prepare_for_model(ids, return_tensors='pt')
+            # Parse the input. Optionally mask the special tokens from the analysis.
+            parsed_input = parse_inputs(inputs)
+            # Run the model, pick from this output whatever inputs you want
+            from utils.f import pick
+            out = model(**pick(['input_ids'], parse_inputs(inputs)))
+            ```
+        """
+        out = inputs.copy()
+        # DEFINE SPECIAL TOKENS MASK
+        if "special_tokens_mask" not in inputs.keys():
+            special_tokens = set([self.tok.unk_token_id, self.tok.cls_token_id, self.tok.sep_token_id, self.tok.bos_token_id, self.tok.eos_token_id, self.tok.pad_token_id])
+            in_ids = inputs['input_ids'][0]
+            special_tok_mask = [1 if int(i) in special_tokens else 0 for i in in_ids]
+            inputs['special_tokens_mask'] = special_tok_mask
+        if mask_attentions:
+            out["attention_mask"] = torch.tensor(
+                [int(not i) for i in inputs.get("special_tokens_mask")]
+            ).unsqueeze(0)
+        else:
+            out["attention_mask"] = torch.tensor(
+                [1 for i in inputs.get("special_tokens_mask")]
+            ).unsqueeze(0)
+        return out