pragmatic-programs
/

human-data-ft-listener

+from dataclasses import dataclass
+from typing import Optional, List
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
+import regex as re
+import torch
+import torch.nn.functional as F
+PROGRAM_SPECIAL_TOKEN="<extra_id_124>"
+UTTERANCES_SPECIAL_TOKEN="<extra_id_123>"
+GT_PROGRAM_SPECIAL_TOKEN="<extra_id_122>"
+def consistent(rx, spec):
+    # spec is in the form of (string, '+'/'-') pairs
+    for s, label in spec:
+        if not label in ['+', '-']:
+            return None
+        try:
+            if re.fullmatch(rx, s, timeout=1):
+                if label == '-':
+                    return False
+            else:
+                if label == '+':
+                    return False
+        except re.error:
+            return None
+        except TimeoutError:
+            return None
+    return True
+def get_utterance_processing_functions(label_pos, idx, separator=' '):
+    if label_pos == "suffix":
+        if idx:
+            def utterances_to_string(spec):
+                return ''.join([f"<extra_id_{i}>{s}{label}" for i, (s, label) in enumerate(spec)])
+        else:
+            def utterances_to_string(spec):
+                return separator.join([f"{s}{label}" for s, label in spec])
+    else:
+        if idx:
+            def utterances_to_string(spec):
+                return ''.join([f"<extra_id_{i}>{label}{s}" for i, (s, label) in enumerate(spec)])
+        else:
+            def utterances_to_string(spec):
+                return separator.join([f"{label}{s}" for s, label in spec])
+    if label_pos == "suffix":
+        if idx:
+            def string_to_utterances(string):
+                string = re.sub(r'<extra_id_\d+>', ' ', string)
+                return [(s[:-1], s[-1]) for s in string.split(' ') if len(s) > 0]
+        else:
+            def string_to_utterances(string):
+                return [(s[:-1], s[-1]) for s in string.split(separator) if len(s) > 0]
+    else:
+        if idx:
+            def string_to_utterances(string):
+                string = re.sub(r'<extra_id_\d+>', '', string)
+                return [(s[1:], s[0]) for s in string.split(separator) if len(s) > 0]
+        else:
+            def string_to_utterances(string):
+                return [(s[1:], s[0]) for s in string.split(separator) if len(s) > 0]
+    return utterances_to_string, string_to_utterances
+def decode(c):
+    if c < 3:
+        return f"<{c}>"
+    elif c < 258:
+        return chr(c - 3)
+    else:
+        return f"<extra_id_{c - 259}>"
+def byt5_decode_batch(outputs, skip_special_tokens=True, skip_position_token=False):
+    skipped_tokens = outputs
+    if skip_special_tokens:
+        skipped_tokens = [
+            [[t for t in x if t >= 3] for x in beam]
+            for beam in skipped_tokens
+            ]
+    if skip_position_token:
+        skipped_tokens = [
+            [[t for t in x if t <= 258] for x in beam]
+            for beam in skipped_tokens
+            ]
+    return [
+        [''.join([decode(t) for t in x]) for x in beam]
+        for beam in skipped_tokens
+    ]
+class Agent:
+    def __init__(self,
+                model_path: str,
+                gen_config: dict,
+                device: str = "cuda",
+                ):
+        self.device = device
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+        self.gen_config = GenerationConfig(**gen_config)
+@dataclass
+class ListenerOutput:
+    programs: List[List[str]]
+    idx: Optional[List[List[int]]] = None
+    decoded: Optional[List[List[str]]] = None
+    decoded_scores: Optional[List[List[float]]] = None
+    pruned: Optional[List[List[str]]] = None
+class Listener(Agent):
+    def __init__(self,
+        model_path,
+        gen_config,
+        device="cuda",
+        label_pos="suffix",
+        idx: bool=True,
+        program_special_token=PROGRAM_SPECIAL_TOKEN,
+        utterances_special_token=UTTERANCES_SPECIAL_TOKEN
+    ):
+        super().__init__(
+            model_path,
+            gen_config,
+            device=device
+        )
+        self.label_pos = label_pos
+        self.idx = idx
+        self.program_special_token = program_special_token
+        self.utterances_special_token = utterances_special_token
+        self.utterances_to_string, self.string_to_utterances = (
+            get_utterance_processing_functions(
+                label_pos, idx, separator=utterances_special_token
+                )
+            )
+    def synthesize(self, context, return_scores=False, enforce_consistency=True):
+        # If context is a list of utterances, convert to string
+        if isinstance(context[0], list):
+            context_str = list(map(self.utterances_to_string, context))
+        else:
+            context_str = context
+        context_tokens = self.tokenizer(
+            [f"{self.utterances_special_token}{c}" if not c.startswith(self.utterances_special_token) else c
+            for c in context_str],
+            return_tensors="pt",
+            padding=True
+            ).to(self.device)
+        decoder_inputs = self.tokenizer(
+            [self.program_special_token for _ in context], return_tensors="pt",
+            add_special_tokens=False
+            ).to(self.device)
+        outputs = self.model.generate(**context_tokens,
+                                      decoder_input_ids=decoder_inputs.input_ids,
+                                      generation_config=self.gen_config,
+                                      return_dict_in_generate=True,
+                                      output_scores=True
+                                      )
+        decoded_batch = byt5_decode_batch(outputs.sequences.reshape((len(context), -1, outputs.sequences.shape[-1])).tolist(), skip_position_token=True, skip_special_tokens=True)
+        consistent_programs = []
+        idxs = []
+        for decoded, ctx in zip(decoded_batch, context):
+            cp = []
+            idx = []
+            for i, p in enumerate(decoded):
+                if enforce_consistency:
+                    if consistent(p, ctx):
+                        cp.append(p)
+                        idx.append(i)
+                else:
+                    cp.append(p)
+                    idx.append(i)
+            consistent_programs.append(cp)
+            idxs.append(idx)
+        logprobs = torch.stack(outputs.scores, dim=1).log_softmax(dim=-1)
+        gen_probs = torch.gather(logprobs, 2, outputs.sequences[:, 1:, None]).squeeze(-1)
+        gen_probs.masked_fill_(gen_probs.isinf(), 0)
+        scores = gen_probs.sum(-1)
+        n_decoded = scores.shape[0]
+        n_seq = n_decoded // len(context)
+        scores = scores.reshape((len(context), n_seq))
+        scores_list = scores.tolist()
+        if return_scores:
+            return ListenerOutput(
+                consistent_programs,
+                idxs,
+                decoded_batch,
+                scores_list
+                )
+        else:
+            return ListenerOutput(consistent_programs)
+    def score_program(self, contexts, programs):
+        if isinstance(contexts[0], list):
+            context_str = list(map(self.utterances_to_string, contexts))
+        else:
+            context_str = contexts
+        context_tokens = self.tokenizer(
+            [f"{self.utterances_special_token}{c}" if not c.startswith(self.utterances_special_token) else c
+            for c in context_str],
+            return_tensors="pt",
+            padding=True
+            ).to(self.device)
+        program_tokens = self.tokenizer([f"{self.program_special_token}{p}" for p in programs], return_tensors="pt").to(self.device)
+        outputs = self.model(input_ids=context_tokens.input_ids, decoder_input_ids=program_tokens.input_ids, return_dict=True)
+        logprobs = torch.gather(F.log_softmax(outputs.logits, dim=-1), 2, program_tokens.input_ids[:, 1:, None]).squeeze(-1)
+        logprobs.masked_fill_(program_tokens.input_ids[:, 1:] == 0, 0)
+        scores = logprobs.sum(-1)
+        return scores.tolist()