Spaces:

m97j
/

pragmatic-agent

Sleeping

App Files Files Community

m97j committed on Dec 14, 2025

Commit

deb604d

1 Parent(s): ac17ed0

fix(llm_model): align token chunking and prefix handling with engine

Browse files

Files changed (1) hide show

models/llm_model.py +16 -21

models/llm_model.py CHANGED Viewed

@@ -37,12 +37,12 @@ class LLMService:
         if isinstance(prompt, torch.Tensor):
             if mode.lower() == "instruct":
                 if "instruct" not in self._prefix_cache:
-                    self._prefix_cache["instruct"] = self.engine.tokenize("/no_think\n")
                 return torch.cat([self._prefix_cache["instruct"], prompt], dim=-1)
             if mode.lower() == "think":
                 if "think" not in self._prefix_cache:
-                    self._prefix_cache["think"] = self.engine.tokenize("/think\n")
                 return torch.cat([self._prefix_cache["think"], prompt], dim=-1)
             return prompt
@@ -73,41 +73,36 @@ class LLMService:
         """
         Util: split text into token chunks not exceeding max_tokens,
         trying to respect sentence boundaries where possible.
         """
         max_tokens = min(14000, max_tokens)
         encodings = self.engine.tokenize(text, return_offsets=True)
-        tokens = encodings["input_ids"]
-        offsets = encodings["offset_mapping"]
-        # detect sentence boundaries
         sentence_boundaries = set(split_content(text, return_boundaries=True))
         chunks = []
-        current_chunk = []
-        current_len = 0
-        for i, tok in enumerate(tokens):
-            current_chunk.append(tok)
-            current_len += 1
-            _, end = offsets[i]
-            if current_len >= max_tokens:
                 boundary_candidates = [b for b in sentence_boundaries if b <= end]
                 if boundary_candidates:
                     boundary_index = max(boundary_candidates)
                     cutoff_token_index = max(
                         j for j, (s, e) in enumerate(offsets[:i+1]) if e <= boundary_index
                     )
-                    chunks.append(current_chunk[:cutoff_token_index+1])
-                    current_chunk = current_chunk[cutoff_token_index+1:]
-                    current_len = len(current_chunk)
                 else:
-                    chunks.append(current_chunk)
-                    current_chunk = []
-                    current_len = 0
-        if current_chunk:
-            chunks.append(current_chunk)
         return chunks

         if isinstance(prompt, torch.Tensor):
             if mode.lower() == "instruct":
                 if "instruct" not in self._prefix_cache:
+                    self._prefix_cache["instruct"] = self.engine.tokenize("/no_think\n")["input_ids"]
                 return torch.cat([self._prefix_cache["instruct"], prompt], dim=-1)
             if mode.lower() == "think":
                 if "think" not in self._prefix_cache:
+                    self._prefix_cache["think"] = self.engine.tokenize("/think\n")["input_ids"]
                 return torch.cat([self._prefix_cache["think"], prompt], dim=-1)
             return prompt
         """
         Util: split text into token chunks not exceeding max_tokens,
         trying to respect sentence boundaries where possible.
+        Returns: List[torch.Tensor] (each tensor is a chunk of token IDs, still on CPU)
         """
         max_tokens = min(14000, max_tokens)
         encodings = self.engine.tokenize(text, return_offsets=True)
+        tokens = encodings["input_ids"][0]          # shape: (N,)
+        offsets = encodings["offset_mapping"][0]    # shape: (N, 2)
+        # detect sentence boundaries (character-level positions in original text)
         sentence_boundaries = set(split_content(text, return_boundaries=True))
         chunks = []
+        start = 0
+        for i, (_, end) in enumerate(offsets):
+            # Once the current chunk reaches max_tokens tokens, cut it at the latest sentence boundary (hard cut if none is found).
+            if (i - start + 1) >= max_tokens:
                 boundary_candidates = [b for b in sentence_boundaries if b <= end]
                 if boundary_candidates:
                     boundary_index = max(boundary_candidates)
                     cutoff_token_index = max(
                         j for j, (s, e) in enumerate(offsets[:i+1]) if e <= boundary_index
                     )
+                    chunks.append(tokens[start:cutoff_token_index+1])
+                    start = cutoff_token_index + 1
                 else:
+                    chunks.append(tokens[start:i+1])
+                    start = i + 1
+        if start < len(tokens):
+            chunks.append(tokens[start:])
         return chunks