krotima1 committed on
Commit
8cf2e56
·
1 Parent(s): 6885f5c

feat: add final AlignScoreCS.py file for ease of use with transformers.

Browse files
Files changed (1) hide show
  1. AlignScoreCS.py +220 -395
AlignScoreCS.py CHANGED
@@ -6,32 +6,37 @@ import numpy as np
6
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
7
  import torch.nn as nn
8
  import torch
9
- # This include should be add when using different AlignScoreFunction methods instead of score()
10
- # from nltk.tokenize import sent_tokenize
11
  from tqdm import tqdm
12
 
13
  class AlignScoreCS(transformers.XLMRobertaModel):
14
  """
15
- ALIGNSCORE class
16
-
17
  Description:
18
  Model ALIGNSCORECS has been trained according the paper for 3 days on 4GPUs AMD NVIDIA.
19
  (3 epochs, 1e-5 learning rate, 1e-6 AdamWeps, batchsize 32, WarmupRatio 0.06, 0.1 WeighDecay)
20
- - XLMROBERTA-base model with 3 classification HEAD {regression,binary,3way} using shared encoder
 
 
 
 
21
 
22
  USAGE: AlignScore.py
23
- - from_pretrained - loads the model, usage as transformers.model
 
24
  - .score(context, claim) - function
25
  - returns probs of the ALIGNED class using 3way class head as in the paper.
26
-
 
 
 
27
  alignScoreCS = AlignScoreCS.from_pretrained("/mnt/data/factcheck/AlignScore-data/AAmodel/MTLModel/mo
28
  alignScoreCS.score(context,claim)
29
-
30
  If you want to try different classification head use parameter:
31
  - task_name = "re" : regression head
32
  - task_name = "bin" : binary classification head
33
  - task_name = "3way" : 3way classification head
34
-
35
  """
36
  _regression_model = "re_model"
37
  _binary_class_model = "bin_model"
@@ -41,324 +46,74 @@ class AlignScoreCS(transformers.XLMRobertaModel):
41
  super().__init__(transformers.XLMRobertaConfig(), **kwargs)
42
  self.encoder = encoder
43
  self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
44
- self.tokenizer = None
45
  self.model_name = model_name
 
46
  self.inferencer = None
 
 
47
 
48
def init_inferencer(self, device="cuda"):
    """Lazily create the tokenizer (if absent) and the paper-style inference handler.

    Parameters
    ----------
    device : str
        Torch device the InferenceHandler moves the model to (default "cuda").
    """
    if not self.tokenizer:
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
    self.inferencer = self.InferenceHandler(self, self.tokenizer, device)
51
 
52
-
53
-
54
  """
55
- Score: scores the context and claim with Aligned probabitlity of 3way classification head
56
- - using paper code inferencer from ALignScore
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  """
59
def score(self, context, claim, **kwargs):
    """Score claim(s) against context(s) using the AlignScore-paper inference path.

    Lazily builds the inference handler on first use, then delegates to its
    ``nlg_eval`` (3-way head, ALIGNED-class probability) and returns whatever
    it produces (a torch tensor of per-pair scores).
    """
    if self.inferencer is None:
        self.init_inferencer()
    return self.inferencer.nlg_eval(context, claim)
 
64
 
65
  """
66
- Score: scores the context and claim with ALIGNED probability (wrt task_name ["re" | "bin" | "3way"])
67
-
68
- Returns the probability of the ALIGNED CLASS between context text and claim text
69
- - chunks text by 350 tokens and splits claim into sentences
70
- - using 3way classification head
71
- """
72
def score_sentences(self, context: str, claim: str, task_name="3way", batch_size=2, return_all_outputs=False, **kwargs):
    """Score claim sentence-chunks against 350-token context chunks.

    The context is windowed into ~350-token chunks and the claim is split
    into sentence chunks; every (claim chunk, context chunk) pair is run
    through the chosen head in mini-batches and aggregated by
    ``alignscore_input`` (max over context chunks, mean over claim chunks).

    Parameters
    ----------
    task_name : str
        "re", "bin" or "3way" classification head.
    batch_size : int
        Mini-batch size for the forward passes.
    return_all_outputs : bool
        When True, return a dict with "score" and the raw per-batch "outputs";
        otherwise return a 1-element tensor with the aggregated score.
    """
    if not self.tokenizer:
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
    raw = self.chunk_sent_input(context, claim, chunk_size=350, chunk_claim_size=150)
    n_claims, n_contexts = raw["n_claims"], raw["n_contexts"]
    with torch.no_grad():
        batched = {k: torch.tensor(v).to(self.device) for k, v in raw.items() if not k.startswith("n_")}
        collected = {}
        total = len(batched["input_ids"])
        for start in range(0, total, batch_size):
            window = slice(start, start + batch_size)
            step_out = self.forward(task_name=task_name,
                                    input_ids=batched["input_ids"][window],
                                    attention_mask=batched["attention_mask"][window],
                                    **kwargs)
            for key, value in step_out.items():
                collected.setdefault(key, []).append(value)
        logits = torch.vstack(collected["logits"]).cpu()
        score_value = self.alignscore_input(logits, nclaims=n_claims, ncontexts=n_contexts, task_name=task_name)
        if return_all_outputs:
            return {"score": score_value, "outputs": collected}
        return torch.tensor([score_value])
87
-
88
-
89
- """
90
- Score: scores the context and claim with ALIGNED probability (wrt task_name ["re" | "bin" | "3way"])
91
-
92
- Returns the probability of the ALIGNED CLASS between context text and claim text
93
- - chunks text into 350 tolens and chunks claim into 150 tokens
94
- - using 3way classification head
95
- """
96
def score_chunks(self, context: str, claim: str, task_name="3way", batch_size=2, return_all_outputs=False, **kwargs):
    """Score a claim against sentence-packed context chunks in one forward pass.

    ``chunk_inputs`` packs context sentences into chunks (350-token budget,
    claim appended to each); all chunks are scored at once and aggregated with
    the deprecated max-over-chunks aggregation.

    NOTE(review): unlike score_sentences/score_truncated this runs without
    torch.no_grad() - confirm whether gradient tracking here is intended.
    ``batch_size`` is currently unused (kept for signature compatibility).
    """
    if not self.tokenizer:
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
    model_inputs = self.chunk_inputs(context, claim, chunk_size=350)
    model_inputs = {name: torch.tensor(vals).to(self.device) for name, vals in model_inputs.items()}
    head_outputs = self.forward(task_name=task_name, **model_inputs, **kwargs)
    result = {
        "score": self.alignscore_input_deprecated(head_outputs.logits.cpu(), task_name=task_name),
        "outputs": head_outputs,
    }
    return result if return_all_outputs else result["score"]
104
-
105
- """
106
- Classify: classify the context and claim to the class label given the task_name ["re" | "bin" | "3way"]
107
-
108
- Returns the class of {Neutral, contradict, aligned} between context text and claim text
109
- - using 3way classification head
110
  """
111
def classify(self, context: str, claim: str, task_name="3way", return_all_outputs=False, **kwargs):
    """Classify the context/claim pair into a single label for the chosen head.

    Context is chunked (``chunk_inputs``, 350-token budget), all chunks are run
    through the head in one forward pass, and ``get_system_label`` reduces the
    per-chunk predictions to one label (majority vote with an
    average-probability tie-break).

    Returns the label, or - when ``return_all_outputs`` is True - a dict with
    "class" and the raw model "outputs".
    """
    if not self.tokenizer:
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
    model_inputs = self.chunk_inputs(context, claim, chunk_size=350)
    model_inputs = {name: torch.tensor(vals).to(self.device) for name, vals in model_inputs.items()}
    head_outputs = self.forward(task_name=task_name, **model_inputs, **kwargs)
    result = {
        "class": self.get_system_label(head_outputs.logits.cpu(), task_name=task_name),
        "outputs": head_outputs,
    }
    return result if return_all_outputs else result["class"]
119
-
120
-
121
def score_truncated(self, context: str, claim: str, task_name="3way", return_all_outputs=False, **kwargs):
    """Score a single context/claim pair hard-truncated to 512 tokens.

    Tokenizes the pair directly (padding/truncation to max_length=512), runs
    one forward pass under no_grad, and aggregates via ``alignscore_input``
    with a 1x1 chunk grid.
    """
    if not self.tokenizer:
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name)
    encoded = self.tokenizer(list(zip([context], [claim])), padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    encoded = {name: torch.tensor(vals).to(self.device) for name, vals in encoded.items()}
    with torch.no_grad():
        head_outputs = self.forward(task_name=task_name, **encoded, **kwargs)
    score_value = self.alignscore_input(head_outputs["logits"].cpu(), nclaims=1, ncontexts=1, task_name=task_name)
    if return_all_outputs:
        return {"score": score_value, "outputs": head_outputs}
    return torch.tensor([score_value])
130
 
131
def forward(self, task_name="3way", **kwargs):
    """Dispatch the forward pass to the task-specific head in taskmodels_dict."""
    head = self.taskmodels_dict[task_name]
    return head(**kwargs)
133
 
134
def __call__(self, task_name, **kwargs):
    """Calling the model directly routes to the selected task head.

    Note ``task_name`` is required here (no default), unlike ``forward``.
    """
    head = self.taskmodels_dict[task_name]
    return head(**kwargs)
136
-
137
- """
138
- Get the probability of the ALIGNED label from input
139
- """
140
def alignscore_input(self, chunked_logits, nclaims, ncontexts, task_name="3way"):
    """Aggregate per-chunk logits into one ALIGNED-probability score.

    The rows of ``chunked_logits`` correspond to the flattened
    (claim chunk x context chunk) grid built by ``chunk_sent_input``
    (claim-major order, nclaims * ncontexts rows). Aggregation is
    SummaC-style: for every claim chunk take the maximum over its context
    chunks, then average over claim chunks.

    Parameters
    ----------
    chunked_logits : torch.Tensor
        Logits of shape (nclaims * ncontexts, n_labels), or
        (nclaims * ncontexts, 1) for the regression head.
    nclaims, ncontexts : int
        Grid dimensions used to un-flatten the rows.
    task_name : str
        "re" (regression), "bin"/"2way" (binary) or "3way" head.

    Returns
    -------
    float : mean over claim chunks of the per-claim maxima.
    """
    if task_name == "re":
        # Regression head emits the score directly - no softmax.
        grid = chunked_logits.detach().view(nclaims, ncontexts)
        best_per_claim, _ = grid.max(dim=1)
        return best_per_claim.mean(dim=0).tolist()
    # Bug fix: "bin" was missing from this label-count map, so the binary
    # head advertised by score_sentences()/classify() raised KeyError here.
    nlabels = {"3way": 3, "re": 1, "2way": 2, "bin": 2}[task_name]
    probs = chunked_logits.softmax(1).detach().view(nclaims, ncontexts, nlabels)
    # Index 1 is the ALIGNED class for both the binary and 3-way heads.
    best_per_claim, _ = probs[:, :, 1].max(dim=1)
    return best_per_claim.mean(dim=0).tolist()
164
-
165
-
166
def alignscore_input_deprecated(self, chunked_logits, task_name="3way"):
    """Deprecated aggregation: maximum over ALL chunks (no claim/context grid).

    Regression head: max of the raw per-chunk outputs. Classification heads:
    max ALIGNED-class (index 1) probability over all chunks.
    """
    detached = chunked_logits.detach()
    if task_name == "re":
        return detached.amax(0).tolist()
    aligned_probs = detached.softmax(1)[:, 1]
    return aligned_probs.amax(0).tolist()
171
-
172
-
173
- """
174
- get the label from the input
175
- """
176
def get_system_label(self, chunked_logits, task_name):
    """Reduce per-chunk logits to a single predicted label.

    Regression head: mean of the raw outputs over all chunks.
    Classification heads: majority vote over the per-chunk argmax
    predictions; a tie falls back to the argmax of the chunk-averaged
    probabilities.
    """
    n_chunks = chunked_logits.size()[0]
    if task_name == "re":
        return (chunked_logits.sum(0) / n_chunks).detach().tolist()
    probs = chunked_logits.softmax(1)
    avg_probs = probs.sum(0) / n_chunks
    votes = probs.argmax(1).detach().numpy()
    labels, freqs = np.unique(votes, return_counts=True)
    winners = labels[freqs == np.max(freqs)]
    # Single majority winner -> return it; otherwise tie-break on averages.
    return winners[0] if winners.size == 1 else avg_probs.detach().argmax().tolist()
189
-
190
- """
191
- Chunks input context and claim - context is chunked into 350 tokens
192
- - claim is chunked into sentences
193
- - using stride for overflowing tokens
194
- """
195
def chunk_sent_input(self, context, claim, max_length=512, chunk_size=350, chunk_claim_size=150):
    """Build model inputs for every (claim chunk, context chunk) pair.

    The context is windowed into ``chunk_size``-token chunks (``chunk_text``)
    and the claim into sentence chunks (``chunk_sentences``); each pair is
    concatenated and padded to ``max_length``. Rows are emitted claim-major,
    matching the un-flattening done in ``alignscore_input``.

    Returns
    -------
    dict with "input_ids" and "attention_mask" (lists of token-id / mask
    lists) plus the grid sizes "n_claims" and "n_contexts".
    """
    # Bug fix: the first assert's message formatted chunk_claim_size where
    # the max size was meant to be printed (2 placeholders, 3 format args).
    assert chunk_size <= max_length, "Chunk size {} cannot be greater than max size {}".format(chunk_size, max_length)
    if chunk_claim_size is None:
        chunk_claim_size = max_length - chunk_size
    assert chunk_size + chunk_claim_size <= max_length, "Chunk size {} and Chunk claim size {} cannot be together greater than max size {}".format(chunk_size, chunk_claim_size, max_length)

    context_chunks = self.chunk_text(context, chunk_size=chunk_size, overflowing_tokens_stride=25, first_special_token=[0])
    claim_chunks = self.chunk_sentences(claim, chunk_size=chunk_claim_size, overflowing_tokens_stride=int(chunk_claim_size / 3), first_special_token=[2])

    batch = {"input_ids": [], "attention_mask": []}
    for claim_chunk in claim_chunks:
        for context_chunk in context_chunks:
            ids, mask = self.fill_with_pad_tokens(context_chunk, claim_chunk)
            batch["input_ids"].append(ids)
            batch["attention_mask"].append(mask)
    batch["n_claims"] = len(claim_chunks)
    batch["n_contexts"] = len(context_chunks)
    return batch
210
-
211
- """
212
- According to paper - chunk the text into smaller parts (350tokens + claim_tokens) when the tokenized inputs exceed the max_length
213
- returns chunked input
214
- """
215
def chunk_inputs(self, context, claim, max_length=512, chunk_size=512, first_fit_within_max_length=True):
    """Pack context sentences into chunks that fit ``chunk_size`` with the claim.

    Per the AlignScore paper: if claim+context already fit, a single padded
    input is returned; otherwise the context is split on sentence boundaries
    (Czech sent_tokenize) and greedily packed so each chunk plus the claim
    stays within the budget, flushing via ``_update_chunked_inputs``.
    """
    assert chunk_size <= max_length, "Chunk size {} cannot be greater than max size {}".format(chunk_size, max_length)

    claim_tok = self.tokenizer(claim, return_length=True)
    # Replace the leading <s> with </s>: pair inputs separate context and
    # claim with a </s></s> boundary.
    claim_tok["input_ids"][0] = 2
    context_tok = self.tokenizer(context, return_length=True)
    claim_len = claim_tok["length"][0]
    assert claim_len < max_length * 4 / 5, "Create chunks of claim sentences. Claim is too long {} which is more than 4/5 from {}.".format(claim_len, max_length)

    # Grow the chunk budget so it also covers the claim tokens.
    chunk_size = min(max_length, chunk_size + claim_len)
    fit_limit = max_length if first_fit_within_max_length else chunk_size

    if claim_len + context_tok["length"][0] <= fit_limit:
        # Everything fits: one padded row, no chunking needed.
        ids, mask = self.fill_with_pad_tokens(context_tok["input_ids"], claim_tok["input_ids"])
        return {"input_ids": [ids], "attention_mask": [mask]}

    chunked = {}
    pending = {}
    for sentence in sent_tokenize(context, language="czech"):
        sent_tok = self.tokenizer(sentence, return_length=True)
        candidate_len = len(pending.get("input_ids", [0])) + sent_tok["length"][0] - 1 + claim_len
        if candidate_len <= chunk_size:
            # Sentence still fits: append its body tokens (drop <s>/</s>).
            pending["input_ids"] = pending.get("input_ids", [0]) + sent_tok["input_ids"][1:-1]
        else:
            # Flush the current chunk and start a new one with this sentence.
            chunked = self._update_chunked_inputs(claim_tok, pending, chunked, max_length, sent_tok)
            pending["input_ids"] = [0] + sent_tok["input_ids"][1:-1]
    if pending != {}:
        # Flush the remainder.
        chunked = self._update_chunked_inputs(claim_tok, pending, chunked, max_length)
    return chunked
245
-
246
- """
247
- Chunks input context and claim - context is chunked into 350 tokens
248
- - claim is chunked into 150 tokens
249
- - using stride for overflowing tokens
250
- """
251
def chunk_input_deprecated(self, context, claim, max_length=512, chunk_size=350, chunk_claim_size=150):
    """Deprecated: chunk both context and claim by fixed token windows.

    Like ``chunk_sent_input``, but the claim is chunked with ``chunk_text``
    (fixed token windows) instead of sentence splitting. Returns
    "input_ids"/"attention_mask" rows for every (claim chunk, context chunk)
    pair plus the grid sizes "n_claims"/"n_contexts".
    """
    # Bug fix: the first assert's message formatted chunk_claim_size where
    # the max size was meant to be printed (2 placeholders, 3 format args).
    assert chunk_size <= max_length, "Chunk size {} cannot be greater than max size {}".format(chunk_size, max_length)
    if chunk_claim_size is None:
        chunk_claim_size = max_length - chunk_size
    assert chunk_size + chunk_claim_size <= max_length, "Chunk size {} and Chunk claim size {} cannot be together greater than max size {}".format(chunk_size, chunk_claim_size, max_length)

    context_chunks = self.chunk_text(context, chunk_size=chunk_size, overflowing_tokens_stride=25, first_special_token=[0])
    claim_chunks = self.chunk_text(claim, chunk_size=chunk_claim_size, overflowing_tokens_stride=int(chunk_claim_size / 3), first_special_token=[2])

    batch = {"input_ids": [], "attention_mask": []}
    for claim_chunk in claim_chunks:
        for context_chunk in context_chunks:
            ids, mask = self.fill_with_pad_tokens(context_chunk, claim_chunk)
            batch["input_ids"].append(ids)
            batch["attention_mask"].append(mask)
    batch["n_claims"] = len(claim_chunks)
    batch["n_contexts"] = len(context_chunks)
    return batch
266
 
267
-
268
- """
269
- Chunk texts into blocks of chunk_size tokens
270
-
271
- """
272
def chunk_text(self, text, chunk_size=350, overflowing_tokens_stride=25, language="czech", first_special_token=[0]):
    """Greedily pack sentence token-ids into chunks of at most chunk_size tokens.

    Sentences come from nltk's sent_tokenize; each returned chunk starts with
    ``first_special_token`` and is closed with </s> (id 2). A single sentence
    longer than chunk_size is sliced into overlapping windows with
    ``overflowing_tokens_stride`` tokens of overlap.
    (``first_special_token``'s mutable default is never mutated here.)
    """
    sentences = sent_tokenize(text, language=language)
    tokenized = self.tokenizer(sentences if sentences != [] else [""], return_length=True)
    chunks = []
    current, current_size = [], 0
    for idx, sent_len in enumerate(tokenized["length"]):
        ids = tokenized["input_ids"][idx]
        if sent_len > chunk_size:
            # Oversized sentence: slice into overlapping windows.
            step = chunk_size - (2 + overflowing_tokens_stride)
            pieces = [first_special_token + ids[max(1, start):min(start + chunk_size - 2, sent_len - 1)] + [2]
                      for start in range(0, sent_len, step)]
        else:
            # Sentence fits as a single piece (keep its trailing </s>).
            pieces = [first_special_token + ids[1:]]
        for piece in pieces:
            body_len = len(piece) - 2
            if current_size == 0:
                # Start a fresh chunk (size counts <s> and </s>).
                current_size = body_len + 2
                current = piece[:-1]
            elif current_size + body_len <= chunk_size:
                # Piece fits: append its body tokens without specials.
                current_size += body_len
                current += piece[1:-1]
            else:
                # Chunk full: close it and start over with this piece.
                chunks += [current + [2]]
                current_size = body_len + 2
                current = piece[:-1]
    # Flush the last open chunk.
    if current != []:
        chunks += [current + [2]]
    return chunks
310
-
311
- """
312
- Chunks text into sentences using nlt.sent_tokenize
313
- """
314
def chunk_sentences(self, text, chunk_size, overflowing_tokens_stride=0, language="czech", sentence_window=2, first_special_token=[2]):
    """Split text into per-sentence token chunks.

    Each sentence becomes its own chunk prefixed with ``first_special_token``;
    sentences longer than chunk_size are sliced into overlapping windows as in
    ``chunk_text``. ``sentence_window`` is currently unused (a sliding-window
    grouping existed here but was disabled).
    """
    sentences = sent_tokenize(text, language=language)
    tokenized = self.tokenizer(sentences if sentences != [] else [""], return_length=True)
    chunks = []
    for idx, sent_len in enumerate(tokenized["length"]):
        ids = tokenized["input_ids"][idx]
        if sent_len > chunk_size:
            # Oversized sentence: slice into overlapping windows.
            step = chunk_size - (2 + overflowing_tokens_stride)
            pieces = [first_special_token + ids[max(1, start):min(start + chunk_size - 2, sent_len - 1)] + [2]
                      for start in range(0, sent_len, step)]
        else:
            pieces = [first_special_token + ids[1:]]
        chunks.extend(pieces)
    return chunks
340
-
341
- """
342
- join context and claim tokens as input_ids and create attention_mask
343
- """
344
def fill_with_pad_tokens(self, first, second, max_length=512, pad_token=1):
    """Concatenate two token-id lists, pad to max_length, and build the mask.

    Returns (input_ids, attention_mask). No truncation is performed: inputs
    longer than max_length come back unpadded at their full length.
    """
    used = len(first) + len(second)
    pad_count = max(max_length - used, 0)
    input_ids = first + second + [pad_token] * pad_count
    attention_mask = [1] * used + [0] * pad_count
    return input_ids, attention_mask
346
-
347
-
348
def _update_chunked_inputs(self, tokenized_claim, current_chunk, return_chunked_inputs, max_length, tok_sent={"input_ids": []}):
    """Close ``current_chunk``, pair it with the claim, and append to the batch.

    Truncates the chunk when chunk+claim would exceed max_length (rare
    long-sentence case); falls back to the raw sentence tokens when the chunk
    is empty. NOTE: tok_sent's mutable default is never mutated here.
    """
    claim_len = tokenized_claim["length"][0]
    budget = max_length - claim_len - 1
    if len(current_chunk.get("input_ids", [0])) + claim_len >= max_length:
        # Chunk + claim would overflow: truncate the chunk to fit.
        closed = current_chunk["input_ids"].copy()[:budget] + [2]
    elif not current_chunk.get("input_ids", False):
        # Empty chunk: use the (truncated) sentence tokens directly.
        closed = tok_sent["input_ids"][:budget] + [2]
    else:
        closed = current_chunk["input_ids"].copy() + [2]  # add </s>
    ids, mask = self.fill_with_pad_tokens(closed, tokenized_claim["input_ids"].copy())
    return_chunked_inputs.setdefault("input_ids", []).append(ids)
    return_chunked_inputs.setdefault("attention_mask", []).append(mask)
    return return_chunked_inputs
361
-
362
  @classmethod
363
  def get_encoder_attr_name(cls, model):
364
  """
@@ -370,13 +125,19 @@ class AlignScoreCS(transformers.XLMRobertaModel):
370
  return "roberta"
371
  else:
372
  raise KeyError(f"Add support for new model {model_class_name}")
373
-
374
-
 
 
 
 
 
375
  @classmethod
376
  def from_pretrained(
377
  cls,
378
  pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
379
  model_name : str = "xlm-roberta-large",
 
380
  *model_args,
381
  config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
382
  cache_dir: Optional[Union[str, os.PathLike]] = None,
@@ -388,19 +149,19 @@ class AlignScoreCS(transformers.XLMRobertaModel):
388
  use_safetensors: bool = None,
389
  **kwargs,
390
  ):
391
- # Check if the required model directories exist then load it from file
392
- if all(os.path.exists(os.path.join(pretrained_model_name_or_path, model_dir)) for model_dir in [cls._3way_class_model, cls._regression_model, cls._binary_class_model]):
393
- # assert all(
394
-
395
- # for model_dir in [cls._3way_class_model, cls._regression_model, cls._binary_class_model]
396
- # ), "Error: Required model directories not found!"
397
-
398
- # Disable the warning about newly initialized weights
399
  transformers.logging.set_verbosity_error()
400
 
401
  shared_encoder = None
402
  taskmodels_dict = {}
403
- for path_name in [cls._regression_model, cls._binary_class_model, cls._3way_class_model]:
404
  task_name = path_name.split("_")[0]
405
 
406
  # Load the configuration for the task-specific model
@@ -417,25 +178,56 @@ class AlignScoreCS(transformers.XLMRobertaModel):
417
  taskmodels_dict[task_name] = model
418
 
419
  # Create the AlignScoreCS with the shared encoder and loaded task-specific models
420
- alignScoreCS = AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
421
- #Try load the model from huggingface hub
422
- else:
423
  shared_encoder = None
424
  taskmodels_dict = {}
425
- for model_dir in [cls._regression_model, cls._binary_class_model, cls._3way_class_model]:
426
- task_name = model_dir.split("_")[0]
427
- config = transformers.XLMRobertaConfig.from_pretrained(f"{pretrained_model_name_or_path}", subfolder=model_dir)
428
- model = transformers.XLMRobertaForSequenceClassification.from_pretrained(f"{pretrained_model_name_or_path}",config=config, subfolder=model_dir)
 
429
  if shared_encoder is None:
430
  shared_encoder = getattr(model, AlignScoreCS.get_encoder_attr_name(model))
431
  else:
432
  setattr(model, AlignScoreCS.get_encoder_attr_name(model), shared_encoder)
433
  taskmodels_dict[task_name] = model
434
- alignScoreCS = AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
435
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  return alignScoreCS
437
 
438
-
 
 
439
  def save_pretrained(
440
  self,
441
  save_directory: Union[str, os.PathLike],
@@ -450,8 +242,11 @@ class AlignScoreCS(transformers.XLMRobertaModel):
450
  save_peft_format: bool = True,
451
  **kwargs,
452
  ):
 
 
453
  for task_name, model_type in self.taskmodels_dict.items():
454
- model_type.save_pretrained(save_directory = Path(save_directory,task_name+"_model"),
 
455
  is_main_process = is_main_process,
456
  state_dict = state_dict,
457
  save_function = save_function,
@@ -463,46 +258,90 @@ class AlignScoreCS(transformers.XLMRobertaModel):
463
  save_peft_format = save_peft_format,
464
  **kwargs)
465
 
466
- # This piece of code is copied from AlignScore github repository
467
- # if you want to use different nlg_eval_mode you have to fix errors on your own
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  class InferenceHandler:
469
def __init__(self, model, tokenizer, device="cuda"):
    """Wrap a model and tokenizer for AlignScore-paper style inference.

    Moves the model to ``device`` and switches it to eval mode. The defaults
    mirror the AlignScore repository: batch size 32, 'nli_sp' evaluation
    mode, 3-way head, non-verbose.
    """
    self.model = model
    self.tokenizer = tokenizer
    self.device = device
    # Inference only: place the model on the device and freeze eval behavior.
    self.model.to(self.device)
    self.model.eval()
    # AlignScore repository defaults.
    self.batch_size = 32
    self.nlg_eval_mode = "nli_sp"
    self.task_name = "3way"
    self.verbose = False
    self.softmax = nn.Softmax(dim=-1)
 
480
 
481
def nlg_eval(self, premise, hypo):
    """Score premise/hypo pairs; single strings are wrapped into lists.

    Delegates to ``inference_example_batch`` and returns its result.
    """
    if isinstance(premise, str) and isinstance(hypo, str):
        premise, hypo = [premise], [hypo]
    return self.inference_example_batch(premise, hypo)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
def inference_example_batch(self, premise: list, hypo: list):
    """Score each (premise, hypo) pair individually (SummaC-style aggregation).

    Each pair is delegated to ``inference_per_example``; the scalar results
    are stacked into a tensor. Disables the inner progress bar for the
    per-example calls.
    """
    self.disable_progress_bar_in_inference = True
    assert len(premise) == len(hypo), "Premise must has the same length with Hypothesis!"
    progress = tqdm(zip(premise, hypo), desc="Evaluating", total=len(premise), disable=(not self.verbose))
    out_score = [self.inference_per_example(one_pre, one_hypo) for one_pre, one_hypo in progress]
    return torch.tensor(out_score)
504
 
505
- def inference_per_example(self, premise:str, hypo: str):
506
  """
507
  inference a example,
508
  premise: string
@@ -522,34 +361,30 @@ class AlignScoreCS(transformers.XLMRobertaModel):
522
  premise_sents = [each for each in chunks(premise_sents, n_chunk)]
523
 
524
  hypo_sents = sent_tokenize(hypo)
 
 
 
 
525
 
526
  premise_sent_mat = []
527
  hypo_sents_mat = []
528
  for i in range(len(premise_sents)):
529
  for j in range(len(hypo_sents)):
530
  premise_sent_mat.append(premise_sents[i])
531
- hypo_sents_mat.append(hypo_sents[j])
532
-
533
- if self.nlg_eval_mode is not None:
534
- if self.nlg_eval_mode == 'nli_sp':
535
- output_score = self.inference(premise_sent_mat, hypo_sents_mat)[:,1] ### use NLI head OR ALIGN head
536
- output_score = output_score.view(len(premise_sents), len(hypo_sents)).max(dim=0).values.mean().item() ### sum or mean depends on the task/aspect
537
-
538
- return output_score
539
 
540
-
541
  output_score = self.inference(premise_sent_mat, hypo_sents_mat) ### use NLI head OR ALIGN head
542
- output_score = output_score.view(len(premise_sents), len(hypo_sents)).max(dim=0).values.mean().item() ### sum or mean depends on the task/aspect
543
-
 
 
544
  return output_score
545
 
546
- def inference(self, premise, hypo, task_name = None):
547
  """
548
  inference a list of premise and hypo
549
-
550
  Standard aggregation
551
  """
552
- task_name = self.task_name if task_name is None else task_name
553
  if isinstance(premise, str) and isinstance(hypo, str):
554
  premise = [premise]
555
  hypo = [hypo]
@@ -560,28 +395,23 @@ class AlignScoreCS(transformers.XLMRobertaModel):
560
  for mini_batch in tqdm(batch, desc="Evaluating", disable=not self.verbose or self.disable_progress_bar_in_inference):
561
  mini_batch = mini_batch.to(self.device)
562
  with torch.no_grad():
563
- model_output = self.model.forward(task_name=task_name, **mini_batch)
564
  model_output = model_output.logits
565
- if task_name == "re":
566
  model_output = model_output.cpu()
 
567
  else:
568
  model_output = self.softmax(model_output).cpu()
569
- output_score.append(model_output[:,:])
570
-
 
 
 
 
 
 
571
  output_score = torch.cat(output_score)
572
 
573
- if self.nlg_eval_mode is not None:
574
- if self.nlg_eval_mode == 'nli':
575
- output_score_nli = output_score[:,1]
576
- return output_score_nli
577
- elif self.nlg_eval_mode == 'bin':
578
- return output_score
579
- elif self.nlg_eval_mode == 'reg':
580
- return output_score
581
- else:
582
- ValueError("unrecognized nlg eval mode")
583
-
584
-
585
  return output_score
586
 
587
  def batch_tokenize(self, premise, hypo):
@@ -606,29 +436,24 @@ class AlignScoreCS(transformers.XLMRobertaModel):
606
  """Yield successive n-sized chunks from lst."""
607
  for i in range(0, len(lst), n):
608
  yield lst[i:i + n]
609
-
610
-
611
-
612
- if __name__ == "__main__":
613
- alignScore = AlignScoreCS.from_pretrained("krotima1/AlignScoreCS")
614
- alignScore.to("cuda" if torch.cuda.is_available() else "cpu")
615
-
616
- print("Tomáš miluje Zuzku!", "|", "Tomáš miluje Petru!",alignScore.score("Tomáš miluje Zuzku!", "Tomáš miluje Petru."))
617
- print("Tomáš miluje Zuzku!", "|", "Tomáš miluje Zuzku!",alignScore.score("Tomáš miluje Zuzku!", "Tomáš miluje Zuzku!"))
618
- print("Tomáš miluje Zuzku.", "|", "Zuzka miluje Tomáše.",alignScore.score("Tomáš miluje Zuzku!", "Zuzka miluje Tomáše."))
619
- print("Tomáš miluje Zuzku.", "|", "Zuzka nemiluje Tomáše.",alignScore.score("Tomáš miluje Zuzku!", "Zuzka nemiluje Tomáše."))
620
- print("Tomáš miluje Zuzku.", "|", "Tomáš nemiluje Zuzku.",alignScore.score("Tomáš miluje Zuzku!", "Tomáš nemiluje Zuzku."))
621
- print("Dva chlapi se perou.", "|", "Je tu bitka.",alignScore.score("Dva chlapi se perou.", "Je tu bitka."))
622
- print("Dva chlapi se perou.", "|", "Je tu láska.",alignScore.score("Dva chlapi se perou.", "Je tu láska."))
623
- print("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta. \n Kdo nechal vystavět katedrálu?", "|", "Byl to Karel.",alignScore.score("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.\nKdo nechal vystavět katedrálu?", "Byl to Karel."))
624
- print("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta. \n Kdo nechal vystavět katedrálu?", "|", "Byl to Vít.",alignScore.score("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.\nKdo nechal vystavět katedrálu?", "Byl to Vít."))
625
- print("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta. \n Kdo nechal vystavět katedrálu?", "|", "Byla to katedrála.",alignScore.score("Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.\nKdo nechal vystavět katedrálu?", "Byla to katedrála."))
626
- print("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "|", "Je Otec.",alignScore.score("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "Je Otec."))
627
- print("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "|", "Je Otec vlasti.",alignScore.score("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "Je Otec vlasti."))
628
- print("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "|", "Je katedrála svatého Víta.",alignScore.score("Kdo je Karel IV.? Karel IV. je Otec vlasti. Nechal postavit katedrálu svatého Víta.", "Je katedrála svatého Víta."))
629
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Karkulka utekla vklovi.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Karkulka utekla vklovi."))
630
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Karkulka neutekla vklovi.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Karkulka neutekla vklovi."))
631
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Vlk snědl Karkulku.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Vlk snědl karkulku."))
632
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Vlk nesnědl Karkulku.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Vlk nesnědl karkulku."))
633
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Karkulka snědla vlka.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Karkulka snědla vlka."))
634
- print("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "|", "Karkulka dala vlkovi jablko.",alignScore.score("Karkulka šla do lesa. V lese potkala vlka. Vlk ji zkoušel sníst, ale Karkulka se nedala a Vlkovi utekla!", "Karkulka dala vlkovi jablko."))
 
6
  from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
7
  import torch.nn as nn
8
  import torch
9
+ from nltk.tokenize import sent_tokenize #If you don't have nltk, you can use another sentence tokenizer
 
10
  from tqdm import tqdm
11
 
12
  class AlignScoreCS(transformers.XLMRobertaModel):
13
  """
14
+ AlignScoreCS class
 
15
  Description:
16
  Model ALIGNSCORECS has been trained according to the paper for 3 days on 4GPUs AMD NVIDIA.
17
  (3 epochs, 1e-5 learning rate, 1e-6 AdamWeps, batchsize 32, WarmupRatio 0.06, 0.1 WeightDecay)
18
+ - XLMROBERTA-large model with 3 classification HEAD {regression,binary,3way} using shared encoder
19
+ - trained on 7M docs incorporating various NLP tasks (QA,STS,Summarization,FactVer,InforRetrievel,NLI,Paraphrase..)
20
+ - English and Czech translated datasets
21
+
22
+ TRY: .show_examples() to see some examples
23
 
24
  USAGE: AlignScore.py
25
+ - .from_pretrained - loads the model, usage as transformers.model
26
+
27
  - .score(context, claim) - function
28
  - returns probs of the ALIGNED class using 3way class head as in the paper.
29
+
30
+ - .classify(context, claim) - function
31
+ - returns predicted class using bin class head as in the paper.
32
+
33
  alignScoreCS = AlignScoreCS.from_pretrained("/mnt/data/factcheck/AlignScore-data/AAmodel/MTLModel/mo
34
  alignScoreCS.score(context,claim)
35
+
36
  If you want to try different classification head use parameter:
37
  - task_name = "re" : regression head
38
  - task_name = "bin" : binary classification head
39
  - task_name = "3way" : 3way classification head
 
40
  """
41
  _regression_model = "re_model"
42
  _binary_class_model = "bin_model"
 
46
  super().__init__(transformers.XLMRobertaConfig(), **kwargs)
47
  self.encoder = encoder
48
  self.taskmodels_dict = nn.ModuleDict(taskmodels_dict)
 
49
  self.model_name = model_name
50
+ self.tokenizer = None
51
  self.inferencer = None
52
+ self.init_inferencer(device = "cpu")
53
+
54
 
55
  def init_inferencer(self, device = "cuda"):
56
  self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name) if not self.tokenizer else self.tokenizer
57
  self.inferencer = self.InferenceHandler(self, self.tokenizer, device)
58
 
 
 
59
  """
60
+ Score: scores the context and claim with Aligned probability of given classification head
61
+ - using altered code inferencer from ALignScore
62
+
63
+ context : list or str
64
+ claim : list or str
65
+ eval_mode : {nli, bin, re, nli_sp, bin_sp or re_sp}
66
+ nli - 3way head
67
+ bin - 2way head
68
+ re - regression head
69
+ (sp - indicates whether to apply the AlignScore function: chunk the context and split the claim into sentences;
70
+ otherwise it truncates the text and returns probability of Aligned from that)
71
+ eval_question_answer : list or str representing question if you want to evaluate context-answer question
72
+ DEFAULT: nli_sp
73
+ Returns the consistency score (probability of Aligned class of 3-way head) between context text and claim text
74
+ - using 2way classification head
75
 
76
  """
77
+ def score(self, context, claim, eval_mode = "nli_sp", eval_question_answer = None, **kwargs):
78
+
79
+ scores = self.inferencer.nlg_eval(context, claim, eval_mode=eval_mode, question = eval_question_answer)
 
80
  return scores
81
+
82
 
83
  """
84
+ Classify: classify the context and claim to the class label given the eval model
85
+ context : list or str
86
+ claim : list or str
87
+ eval mode : {nli, bin, re, nli_sp, bin_sp or re_sp}
88
+ nli - 3way head
89
+ bin - 2way head
90
+ re - regression head
91
+ (sp - indicates whether to apply the AlignScore classification function: chunk the context and split the claim into sentences;
92
+ otherwise it truncates the text and returns predicted class)
93
+ DEFAULT: bin_sp
94
+ Returns the class of {Contradict, Aligned} between context text and claim text
95
+ - using 2way classification head
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  """
97
+ def classify(self, context, claim, eval_mode = "bin_sp", **kwargs):
98
+ eval_mode = eval_mode+"_cls" if ("cls" not in eval_mode) and ("class" not in eval_mode) else eval_mode
99
+ scores = self.inferencer.nlg_eval(context, claim, eval_mode=eval_mode)
100
+ return scores
101
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
  def forward(self, task_name = "3way", **kwargs):
104
  return self.taskmodels_dict[task_name](**kwargs)
105
 
106
  def __call__(self, task_name, **kwargs):
107
  return self.taskmodels_dict[task_name](**kwargs)
 
 
 
 
 
 
 
 
 
108
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ def to(self, device, **kwargs):
111
+ self.init_inferencer(device = device)
112
+ return super().to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
 
 
114
 
115
+ return self
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  @classmethod
118
  def get_encoder_attr_name(cls, model):
119
  """
 
125
  return "roberta"
126
  else:
127
  raise KeyError(f"Add support for new model {model_class_name}")
128
+ """
129
+ pretrained_model_name_or_path :str "krotima1/AlignScoreCS" // it is also possible to use another NLI model, but then specify load_specific_head as 3way
130
+ - path to the directory of AlignScoreCS
131
+ - or pass "build_new" to create new multitask AlignScore architecture.
132
+ load_specific_head :str ["re", "bin", "3way"] or None // use this, and it will load only one architecture
133
+ load_another_model
134
+ """
135
  @classmethod
136
  def from_pretrained(
137
  cls,
138
  pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
139
  model_name : str = "xlm-roberta-large",
140
+ load_specific_head = None,
141
  *model_args,
142
  config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
143
  cache_dir: Optional[Union[str, os.PathLike]] = None,
 
149
  use_safetensors: bool = None,
150
  **kwargs,
151
  ):
152
+
153
+ architectures = [cls._3way_class_model, cls._regression_model, cls._binary_class_model] if load_specific_head is None else {"re": [cls._regression_model], "bin": [cls._binary_class_model], "3way" : [cls._3way_class_model]}[load_specific_head]
154
+ is_3way_included = "3way" == load_specific_head or load_specific_head is None
155
+
156
+ # DEPRECATED = it is here only because of loading previous versions... load from file
157
+ if all(os.path.exists(os.path.join(pretrained_model_name_or_path, model_dir)) for model_dir in architectures):
158
+
159
+ # Disables the warning
160
  transformers.logging.set_verbosity_error()
161
 
162
  shared_encoder = None
163
  taskmodels_dict = {}
164
+ for path_name in tqdm(architectures, desc='DEPRECATED: Loading architectures from a local directory'):
165
  task_name = path_name.split("_")[0]
166
 
167
  # Load the configuration for the task-specific model
 
178
  taskmodels_dict[task_name] = model
179
 
180
  # Create the AlignScoreCS with the shared encoder and loaded task-specific models
181
+ return AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
182
+ # UP TO DATE LOADING FROM FILE:
183
+ if all(os.path.exists(os.path.join(pretrained_model_name_or_path, {"3way_model" : "pytorch_model.bin"}.get(model_dir,model_dir))) for model_dir in architectures):
184
  shared_encoder = None
185
  taskmodels_dict = {}
186
+ for path_name in tqdm(architectures, desc='Loading architectures from a local directory'):
187
+ task_name = path_name.split("_")[0]
188
+ load_path = pretrained_model_name_or_path if task_name == "3way" else "{}/{}".format(pretrained_model_name_or_path,path_name)
189
+ task_config = transformers.XLMRobertaConfig.from_json_file("{}/config.json".format(load_path))
190
+ model = transformers.XLMRobertaForSequenceClassification.from_pretrained("{}".format(load_path), config=task_config,*model_args,**kwargs)
191
  if shared_encoder is None:
192
  shared_encoder = getattr(model, AlignScoreCS.get_encoder_attr_name(model))
193
  else:
194
  setattr(model, AlignScoreCS.get_encoder_attr_name(model), shared_encoder)
195
  taskmodels_dict[task_name] = model
196
+ return AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
197
+ # BUILD NEW AlignScoreCS
198
+ if pretrained_model_name_or_path == "build_new":
199
+ shared_encoder = None
200
+ taskmodels_dict = {}
201
+ for path_name in tqdm([cls._3way_class_model, cls._regression_model, cls._binary_class_model], desc=f'Building new architectures from {model_name}'):
202
+ task_name = path_name.split("_")[0]
203
+ task_config = transformers.XLMRobertaConfig.from_pretrained(model_name)
204
+ model = transformers.XLMRobertaForSequenceClassification.from_pretrained(model_name, config=task_config,*model_args,**kwargs)
205
+ if shared_encoder is None:
206
+ shared_encoder = getattr(model, AlignScoreCS.get_encoder_attr_name(model))
207
+ else:
208
+ setattr(model, AlignScoreCS.get_encoder_attr_name(model), shared_encoder)
209
+ taskmodels_dict[task_name] = model
210
+ return AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
211
+
212
+ #LOADING FROM HUGGINGFACE HUB
213
+ shared_encoder = None
214
+ taskmodels_dict = {}
215
+ for model_dir in tqdm(architectures, desc='Loading from huggingface HUB'):
216
+ task_name = model_dir.split("_")[0]
217
+ subfolder = '' if task_name == "3way" else model_dir
218
+ config = transformers.XLMRobertaConfig.from_pretrained(f"{pretrained_model_name_or_path}", subfolder=subfolder)
219
+ model = transformers.XLMRobertaForSequenceClassification.from_pretrained(f"{pretrained_model_name_or_path}",config=config, subfolder=subfolder)
220
+ if shared_encoder is None:
221
+ shared_encoder = getattr(model, AlignScoreCS.get_encoder_attr_name(model))
222
+ else:
223
+ setattr(model, AlignScoreCS.get_encoder_attr_name(model), shared_encoder)
224
+ taskmodels_dict[task_name] = model
225
+ alignScoreCS = AlignScoreCS(encoder=shared_encoder, taskmodels_dict=taskmodels_dict, model_name=model_name)
226
  return alignScoreCS
227
 
228
+ """
229
+ This saves the architectures into the directory. Model with 3way head is in the main dir, while bin and reg are in subfolders (bin_model, re_model).
230
+ """
231
  def save_pretrained(
232
  self,
233
  save_directory: Union[str, os.PathLike],
 
242
  save_peft_format: bool = True,
243
  **kwargs,
244
  ):
245
+ #It would be awesome to rewrite this to save only the classifier's head from taskmodels_dict and one encoder instead of 3x same encoder.
246
+ #But who cares - only those who need to save storage
247
  for task_name, model_type in self.taskmodels_dict.items():
248
+ final_directory = Path(save_directory,task_name+"_model") if task_name in ["re", "bin"] else Path(save_directory)
249
+ model_type.save_pretrained(save_directory = final_directory,
250
  is_main_process = is_main_process,
251
  state_dict = state_dict,
252
  save_function = save_function,
 
258
  save_peft_format = save_peft_format,
259
  **kwargs)
260
 
261
+ """
262
+ This piece of code is copied and modified from AlignScore github code from: https://github.com/yuh-zha/AlignScore/blob/main/src/alignscore/inference.py
263
+ ### eval_mode ####
264
+ ## sp ## means chunk context to roughly 300 tokens and split claim into sentences then apply AlignScore function to get consistency score
265
+ - nli_sp - ## DEFAULT ## returns consistency score of Aligned class from 3way head given context and claim using chunking
266
+ - bin_sp - returns consistency score of Aligned class from 2way head given context and claim using chunking
267
+ - re_sp - returns output of regression head given context and claim using chunking.
268
+ ## simple ## without chunking and splitting
269
+ - nli - returns probability of Aligned class from 3way head given context and claim.
270
+ - bin - returns probability of Aligned class from 2way head given context and claim.
271
+ - re - returns output of regression head given context and claim.
272
+ ## sp_cls ## means chunk context to roughly 300 tokens and split claim into sentences then apply AlignScore Classification function to get class
273
+ - nli_sp_cls - returns class from 3way head given context and claim using chunking
274
+ - bin_sp_cls - returns class from 2way head given context and claim using chunking
275
+ - re_sp_cls - returns output from regression head given context and claim using chunking
276
+ ## simple ## without chunking and splitting
277
+ - nli_cls - returns class of Aligned class from 3way head given context and claim.
278
+ - bin_cls - returns class from 2way head given context and claim.
279
+ - re_cls - returns output of regression head given context and claim.
280
+ """
281
  class InferenceHandler:
282
+ def __init__(self, model, tokenizer, device = "cuda", batch_size = 32, verbose = False):
283
+ # self.position_of_aligned_class = {"3way" : 1, "bin" : 1}
284
+ self.input_evalmode_handler = {"3way_sp" : "nli_sp", "3way_sp_class" : "nli_sp_cls", "3way" : "nli", "3-way" : "nli", "3way_class" : "nli_cls",
285
+ "2way_sp" : "bin_sp", "2way_sp_class" : "bin_sp_cls", "2way" : "bin", "2-way" : "bin", "2way_class" : "bin_cls",
286
+ "reg_sp" : "re_sp", "reg_sp_class" : "re_sp_cls", "reg" : "re", "reg_class" : "re_cls"}
287
+ self.taskname_handler = lambda eval_mode: "3way" if "nli" in eval_mode else ("bin" if "bin" in eval_mode else "re")
288
+ #DEFAULT
289
+ self.nlg_eval_mode = "nli_sp"
290
+ self.task_name = "3way"
291
+
292
+ #Model setup
293
  self.model = model
294
  self.device = device
295
  self.tokenizer = tokenizer
296
+ # self.model.to(self.device)
297
  self.model.eval()
298
+
299
+ self.batch_size = batch_size
300
+ self.verbose = verbose
301
+
302
  self.softmax = nn.Softmax(dim=-1)
303
+
304
 
305
+ def nlg_eval(self, premise, hypo, eval_mode = "nli_sp", question = None):
306
+
307
  if isinstance(premise, str) and isinstance(hypo, str):
308
  premise = [premise]
309
  hypo = [hypo]
310
+ if (isinstance(question,str)):
311
+ question = [question]
312
+
313
+ if question is None:
314
+ question = [None]*len(premise)
315
+
316
+ #setup
317
+ self.nlg_eval_mode = self.input_evalmode_handler.get(eval_mode, eval_mode)
318
+ self.task_name = self.taskname_handler(self.nlg_eval_mode)
319
+ assert self.nlg_eval_mode in set(self.input_evalmode_handler.values()), f"eval_mode is wrong {self.nlg_eval_mode}, use please : nli_sp or any other, look at the comments."
320
+
321
+ if "sp" in self.nlg_eval_mode:
322
+ return self.inference_example_batch(premise, hypo, question)
323
+ elif "sp" not in self.nlg_eval_mode:
324
+ return self.inference(premise, hypo)
325
+ return None
326
 
327
+ def inference_example_batch(self, premise: list, hypo: list, question : list):
328
  """
329
  inference a example,
330
  premise: list
331
  hypo: list
332
  using self.inference to batch the process
 
333
  SummaC Style aggregation
334
  """
335
  self.disable_progress_bar_in_inference = True
336
  assert len(premise) == len(hypo), "Premise must has the same length with Hypothesis!"
337
 
338
  out_score = []
339
+ for one_pre, one_hypo, one_quest in tqdm(zip(premise, hypo, question), desc="Evaluating", total=len(premise), disable=(not self.verbose)):
340
+ out_score.append(self.inference_per_example(one_pre, one_hypo, one_quest))
341
 
342
  return torch.tensor(out_score)
343
 
344
+ def inference_per_example(self, premise:str, hypo: str, quest = None):
345
  """
346
  inference a example,
347
  premise: string
 
361
  premise_sents = [each for each in chunks(premise_sents, n_chunk)]
362
 
363
  hypo_sents = sent_tokenize(hypo)
364
+
365
+ #add question to each sentence
366
+ if quest is not None:
367
+ hypo_sents = [quest+" "+ sent for sent in hypo_sents]
368
 
369
  premise_sent_mat = []
370
  hypo_sents_mat = []
371
  for i in range(len(premise_sents)):
372
  for j in range(len(hypo_sents)):
373
  premise_sent_mat.append(premise_sents[i])
374
+ hypo_sents_mat.append(hypo_sents[j])
 
 
 
 
 
 
 
375
 
 
376
  output_score = self.inference(premise_sent_mat, hypo_sents_mat) ### use NLI head OR ALIGN head
377
+ if "cls" in self.nlg_eval_mode:
378
+ output_score = output_score.view(len(premise_sents), len(hypo_sents),-1).mean(1).mean(0).argmax().item()
379
+ else:
380
+ output_score = output_score.view(len(premise_sents), len(hypo_sents)).max(dim=0).values.mean().item() ### sum or mean depends on the task/aspect
381
  return output_score
382
 
383
+ def inference(self, premise, hypo):
384
  """
385
  inference a list of premise and hypo
 
386
  Standard aggregation
387
  """
 
388
  if isinstance(premise, str) and isinstance(hypo, str):
389
  premise = [premise]
390
  hypo = [hypo]
 
395
  for mini_batch in tqdm(batch, desc="Evaluating", disable=not self.verbose or self.disable_progress_bar_in_inference):
396
  mini_batch = mini_batch.to(self.device)
397
  with torch.no_grad():
398
+ model_output = self.model.forward(task_name=self.task_name, **mini_batch)
399
  model_output = model_output.logits
400
+ if self.task_name == "re":
401
  model_output = model_output.cpu()
402
+ model_output = model_output[:,0]
403
  else:
404
  model_output = self.softmax(model_output).cpu()
405
+ if "cls" in self.nlg_eval_mode:
406
+ model_output = model_output
407
+ if "sp" not in self.nlg_eval_mode:
408
+ model_output = model_output.argmax(-1)
409
+ else:
410
+ model_output = model_output[:,1]
411
+
412
+ output_score.append(model_output)
413
  output_score = torch.cat(output_score)
414
 
 
 
 
 
 
 
 
 
 
 
 
 
415
  return output_score
416
 
417
  def batch_tokenize(self, premise, hypo):
 
436
  """Yield successive n-sized chunks from lst."""
437
  for i in range(0, len(lst), n):
438
  yield lst[i:i + n]
439
+
440
+ def show_examples(self):
441
+ self.to("cuda" if torch.cuda.is_available() else "cpu")
442
+ contexts = ["Jaromír Jágr (68) střelil poslední gól sezóny do branky Dominika Haška. Davy šílely dokonce i po celém zápase."]
443
+ claims = ["Dav šílel, když Jarda (68) střelil gól.", "Dav šílel, když Jarda (78) střelil gól.", "Dav šílel jen při zápase, když Jarda (68) střelil gól.", "Dominik Hašek nedokázal chytit poslední střelu od Jágra.",
444
+ "Dominik Jágr (68) střelil poslední gól sezóny do branky Jaromíra Haška.", "Dominik Jágr (68) střelil poslední gól sezóny do branky Dominika Haška.", "Jaromír jágr nestřelil gól v sezóně.",
445
+ "Davy šílely, když střelily gól do branky Dominika Haška.","Davy šílely, když davy střelily gól do branky Dominika Haška.", "Dav šílel. Jarda střelil gól.", "Dav šílel. Jarda nestřelil gól.",
446
+ "Dneska odevzdávám diplomovou práci a koukám na hokej.", "Téma pojednává o hokeji", "Téma pojednává o baletu", "Dominik hašek je brankář", "Dominik hašek je útočník", "Jaromír Jágr je střelec", "Jaromír Jágr je hokejový útočník",
447
+ "Jaromír Jágr je hokejový brankář", "Na utkání se dívaly davy lidí, které byly potichu.", "Na utkání se dívaly davy lidí, které šílely."]
448
+ print("EXAMPLES:")
449
+ print("context:",contexts[0])
450
+ print("SCORE: ", "claims:")
451
+ for co, cl in zip(contexts*len(claims),claims):
452
+ print(round(self.score(co,cl,eval_mode="nli_sp").tolist()[0],5),cl)
453
+ print("EXAMPLES QA:")
454
+ print("SCORE: ", "q-a pairs:")
455
+ claims = [("Kdo střelil gól?", "Jaromír Jágr."), ("Kdo střelil gól?", "Domink Hašek."), ("Kdo nechytil střelu?", "Jaromír Jágr."), ("Kdo nechytil střelu?", "Domink Hašek.")
456
+ , ("Jaký číslo drezu Jaromír Jágr?", "Jaromír Jágr číslo drezu 68."), ("Kolik je Jaromíru Jágrovi let?", "Jaromíru Jágrovi je 68."), ("Kolik je Jaromíru Jágrovi let?", "Jaromíru Jágrovi je 67.")
457
+ , ("Co udělali lidi, když Jágr střelil gól?", "Lidi začali šílet. Dokonce šílely i po zápase."), ("Co udělali lidi, když Jágr střelil gól?", "Šli dát góla Haškovi")]
458
+ for co,cl in zip(contexts*len(claims),claims):
459
+ print(round(model.score(co, cl[1],eval_mode="nli_sp",eval_question_answer=cl[0] ).tolist()[0],5)," ".join(cl))