Spaces:

akhaliq/KAT-Dev

Sleeping

App Files Files Community

akhaliq HF Staff committed on Sep 27, 2025

Commit

eae8d97

verified ·

1 Parent(s): af9ed97

Update Gradio app with multiple files

Browse files

Files changed (1) hide show

models.py +51 -33

models.py CHANGED Viewed

@@ -69,12 +69,18 @@ def stream_generate_response(prompt: str, history: list) -> Generator[str, None,
     for human, bot in history:
         # Add past exchanges
         if human:
-            messages.append({"role": "user", "content": human})
         if bot:
-            messages.append({"role": "assistant", "content": bot})
     # Add the current prompt
-    messages.append({"role": "user", "content": prompt})
     # Apply chat template
     text = tokenizer.apply_chat_template(
@@ -86,45 +92,57 @@ def stream_generate_response(prompt: str, history: list) -> Generator[str, None,
     # Prepare inputs and move to model device
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    # Use TextStreamer for efficient token streaming
-    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # Start generation in a separate thread (TextStreamer uses an internal blocking mechanism)
-    # Since Gradio's generator interface expects synchronous yields from the main thread
-    # within the @spaces.GPU context, we need to adapt the TextStreamer output.
-    # A cleaner approach for Gradio streaming is direct model generation without TextStreamer:
     input_ids = model_inputs.input_ids
     generated_ids = model.generate(
         input_ids=input_ids,
         max_new_tokens=MAX_NEW_TOKENS,
         do_sample=DO_SAMPLE,
         temperature=TEMPERATURE,
         pad_token_id=tokenizer.eos_token_id,
-        return_dict_in_generate=True,
-        output_scores=True,
-        min_new_tokens=1,
-        # Enable iterative decoding
         repetition_penalty=1.1,
     )
-    full_response = ""
-    # Process output sequence token by token
-    for seq in generated_ids.sequences:
-        # Get the new tokens generated after the prompt
-        new_tokens = seq[input_ids.shape[-1]:]
-        # Decode only the newly generated part of the sequence so far
-        current_response = tokenizer.decode(new_tokens, skip_special_tokens=True)
-        # Yield only the difference from the previous chunk
-        if len(current_response) > len(full_response):
-            new_text = current_response[len(full_response):]
-            full_response = current_response
-            yield new_text
-    # Final cleanup (sometimes the model output is slightly messy)
-    if full_response:
-        yield full_response.strip()

     for human, bot in history:
         # Add past exchanges
         if human:
+            messages.append({
+"role": "user", "content": human
+})
         if bot:
+            messages.append({
+"role": "assistant", "content": bot
+})
     # Add the current prompt
+    messages.append({
+"role": "user", "content": prompt
+})
     # Apply chat template
     text = tokenizer.apply_chat_template(
     # Prepare inputs and move to model device
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    # Create a custom streamer that works with Gradio
+    class GradioStreamer:
+        def __init__(self, tokenizer):
+            self.tokenizer = tokenizer
+            self.text_queue = []
+            self.generated_text = ""
+        def put(self, value):
+            # Decode the new tokens and add to queue
+            if isinstance(value, torch.Tensor):
+                new_text = self.tokenizer.decode(value, skip_special_tokens=True)
+                # Only yield the new part
+                if new_text.startswith(self.generated_text):
+                    new_part = new_text[len(self.generated_text):]
+                    if new_part:
+                        self.text_queue.append(new_part)
+                        self.generated_text = new_text
+                else:
+                    # Sometimes the decoding might not align perfectly
+                    self.text_queue.append(new_text)
+                    self.generated_text = new_text
+        def end(self):
+            pass
+        def __iter__(self):
+            return iter(self.text_queue)
+    # Create our custom streamer
+    gradio_streamer = GradioStreamer(tokenizer)
+    # Generate with streaming
     input_ids = model_inputs.input_ids
+    # Generate tokens one by one for true streaming
     generated_ids = model.generate(
         input_ids=input_ids,
         max_new_tokens=MAX_NEW_TOKENS,
         do_sample=DO_SAMPLE,
         temperature=TEMPERATURE,
         pad_token_id=tokenizer.eos_token_id,
+        streamer=gradio_streamer,
         repetition_penalty=1.1,
     )
+    # Yield the text as it's generated
+    accumulated_text = ""
+    for new_chunk in gradio_streamer.text_queue:
+        accumulated_text += new_chunk
+        yield accumulated_text
+    # Final yield to ensure complete text is sent
+    if accumulated_text:
+        yield accumulated_text.strip()