Upload folder using huggingface_hub
moondream.py +54 -2
moondream.py CHANGED
@@ -216,6 +216,7 @@ class MoondreamModel(nn.Module):
     def _prefill_prompt(
         self, kv_cache: torch.Tensor, prompt_tokens: torch.Tensor, pos: int, temperature: float, top_p: float
     ):
+
         with torch.no_grad():
             prompt_emb = text_encoder(prompt_tokens, self.text)
             hidden = self.ops["prefill"](
@@ -257,15 +258,40 @@ class MoondreamModel(nn.Module):
         )
 
         def generator(next_token, pos):
+            mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
+            mask[:, :, :pos] = 1
+            pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
             generated_tokens = 0
 
+            token_cache = []
+            print_len = 0
+
             while (
                 next_token_id := next_token.item()
             ) != self.config.tokenizer.eos_id and generated_tokens < max_tokens:
-
+                # Add token to our cache
+                token_cache.append(next_token_id)
+
+                # Decode all tokens collected so far
+                text = self.tokenizer.decode(token_cache)
+
+                # After a newline, we flush the cache completely
+                if text.endswith("\n"):
+                    printable_text = text[print_len:]
+                    token_cache = []
+                    print_len = 0
+                    if printable_text:
+                        yield printable_text
+                # If the last token is a CJK character, we can safely print it
+                elif len(text) > 0 and _is_cjk_char(ord(text[-1])):
+                    printable_text = text[print_len:]
+                    print_len += len(printable_text)
+                    if printable_text:
+                        yield printable_text
 
                 with torch.no_grad():
                     next_emb = text_encoder(next_token, self.text)
+                    mask[:, :, pos], pos_ids[0] = 1, pos
                     logits, _, kv_cache_update = self.ops["decode_one_token"](
                         next_emb, kv_cache, pos, self.text, self.config.text
                     )
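Why does the hunk above buffer token ids in token_cache instead of yielding each token as it is decoded? With byte-level BPE vocabularies a single token can end in the middle of a multi-byte UTF-8 character, so decoding tokens one at a time can emit U+FFFD replacement characters. The hunk instead decodes the whole cache every iteration and slices off only the prefix that was already emitted. Below is a minimal self-contained sketch of the same cache-and-slice idea; StubTokenizer is an assumption for illustration, not Moondream's tokenizer, and it flushes whenever the tail decodes cleanly rather than on the hunk's newline/CJK boundaries.

class StubTokenizer:
    # Stand-in byte-level tokenizer: each "token" is one UTF-8 byte.
    def decode(self, token_ids):
        return bytes(token_ids).decode("utf-8", errors="replace")

tok = StubTokenizer()
tokens = list("né".encode("utf-8"))  # [110, 195, 169]; 'é' spans two tokens

# Naive per-token decoding garbles the multi-byte character:
print("".join(tok.decode([t]) for t in tokens))
# prints "n" followed by two U+FFFD replacement characters

# Decoding the growing cache and slicing off what was already emitted does not:
cache, printed = [], 0
for t in tokens:
    cache.append(t)
    text = tok.decode(cache)
    if not text.endswith("\ufffd"):  # flush only once the tail decodes cleanly
        print(text[printed:], end="")
        printed = len(text)
# prints "né"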
@@ -273,8 +299,22 @@ class MoondreamModel(nn.Module):
                     kv_cache_update
                 )
                 pos += 1
-
+
+                if temperature == 0:
+                    next_token = torch.argmax(logits, dim=-1)  # (1, 1)
+                else:
+                    probs = torch.softmax(logits / temperature, dim=-1)  # (1, V)
+                    probs = self._apply_top_p(probs, top_p)
+                    next_token = torch.multinomial(probs, num_samples=1).squeeze(1)  # (1, 1)
+
                 generated_tokens += 1
 
+            # Flush any remaining text in the cache
+            if token_cache:
+                text = self.tokenizer.decode(token_cache)
+                printable_text = text[print_len:]
+                if printable_text:
+                    yield printable_text
 
         return generator(next_token, pos)
 
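The sampling branch above calls self._apply_top_p, which this diff does not show. A common nucleus-sampling filter that it could plausibly resemble, written here as a free function (a sketch under that assumption, not the repository's actual implementation):

import torch

def _apply_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
    # Sort probabilities in descending order and accumulate mass.
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cum_probs = torch.cumsum(sorted_probs, dim=-1)
    # Drop tokens outside the smallest set whose mass exceeds top_p;
    # subtracting sorted_probs keeps at least the most likely token.
    sorted_probs[cum_probs - sorted_probs > top_p] = 0.0
    sorted_probs /= sorted_probs.sum(dim=-1, keepdim=True)  # renormalize
    # Scatter the filtered probabilities back to vocabulary order.
    return torch.zeros_like(probs).scatter(-1, sorted_idx, sorted_probs)

Keeping at least the top token (via the cum_probs - sorted_probs shift) guarantees torch.multinomial always has nonzero mass to sample from.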
@@ -617,3 +657,15 @@ class MoondreamModel(nn.Module):
         )
 
         return {"gaze": {"x": mean_gaze[0], "y": mean_gaze[1]}}
+
+def _is_cjk_char(cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    if (
+        (cp >= 0x4E00 and cp <= 0x9FFF)
+        or (cp >= 0x3400 and cp <= 0x4DBF)
+        or (cp >= 0x2F800 and cp <= 0x2FA1F)
+    ):
+        return True
+    return False
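Taken together, the commit turns generation into a stream of display-safe chunks rather than one final string. A usage sketch, assuming the model's public entry point exposes a stream flag that returns this generator (the query call and its signature are illustrative, not confirmed by this diff):

# Hypothetical call; "query" and "stream" are assumptions about the public API.
for chunk in model.query(image, "Describe this image.", stream=True)["answer"]:
    # Each chunk is final text that later tokens cannot retroactively change,
    # so it can be written straight to a terminal.
    print(chunk, end="", flush=True)
print()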
|