Spaces:

stanley-00
/

slm-testing

Running on Zero

App Files Community

Add model switching interruption and support for reasoning model tokens

by deleted - opened Jun 8

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+61

-6

Files changed (1) hide show

app.py +61 -6

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 import gc
 import os
@@ -49,10 +49,17 @@ class ModelManager:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 model_manager = ModelManager()
 def get_system_stats(request: gr.Request = None):
     """Returns a dictionary of current system metrics with formatted strings."""
     mem = psutil.virtual_memory()
@@ -66,9 +73,13 @@ def get_system_stats(request: gr.Request = None):
 def load_new_model(model_id):
     """Loads the model and tokenizer dynamically into the global manager."""
     # Clear old model from memory
     model_manager.model = None
     model_manager.tokenizer = None
     yield f"Loading {model_id}..."
     gc.collect()
     if torch.cuda.is_available():
@@ -81,6 +92,7 @@ def load_new_model(model_id):
         model_manager.tokenizer = tokenizer
         model_manager.model = model
         yield f"Successfully loaded {model_id} on {model_manager.device.upper()}"
     except Exception as e:
@@ -91,15 +103,33 @@ def run_inference(user_prompt, max_tokens, temperature, top_k, top_p, rep_penalt
     if model_manager.model is None or model_manager.tokenizer is None:
         yield "Please load a model first.", "Model not loaded"
         return
     tokenizer = model_manager.tokenizer
     model = model_manager.model
     # Tokenize input
-    inputs = tokenizer([user_prompt], return_tensors="pt").to(model_manager.device)
     # Set up the streamer
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     # Adjust variables based on the do_sample logic
     if not do_sample:
@@ -116,7 +146,8 @@ def run_inference(user_prompt, max_tokens, temperature, top_k, top_p, rep_penalt
         repetition_penalty=float(rep_penalty),
         no_repeat_ngram_size=int(ngram_size),
         do_sample=do_sample,
-        pad_token_id=tokenizer.eos_token_id # Prevents padding warnings
     )
     start_time = time.time()
@@ -124,15 +155,39 @@ def run_inference(user_prompt, max_tokens, temperature, top_k, top_p, rep_penalt
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
     # Yield output iteratively for the streaming effect
-    generated_text = user_prompt
     token_count = 0
     for new_text in streamer:
         generated_text += new_text
         token_count += 1
         duration = time.time() - start_time
         tps = token_count / duration if duration > 0 else 0
-        yield generated_text, f"Speed: {tps:.2f} tokens/sec"
 def clean_cache():
     if os.path.exists(HF_CACHE_DIR):

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria, StoppingCriteriaList
 from threading import Thread
 import gc
 import os
     def __init__(self):
+        # Nothing is loaded at construction; load_new_model assigns model, tokenizer and model_id.
         self.model = None
         self.tokenizer = None
+        # Identifier of the currently loaded model; inference checks it for "Supra-50M-Reasoning".
+        self.model_id = None
+        # Cooperative stop flag: set to True when a new model load begins, reset to False
+        # at the start of each inference run. It is polled, not pre-emptive.
+        self.stop_generation = False # Read by StopOnFlag and the streaming loop to halt generation
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 model_manager = ModelManager()
+# Custom stopping criterion to halt the generation thread when loading a new model.
+# It reports the current value of the module-level model_manager.stop_generation flag,
+# which load_new_model sets to True and each inference run resets to False.
+class StopOnFlag(StoppingCriteria):
+    # input_ids and scores are accepted to match the call signature but are not used;
+    # the result depends only on the shared flag.
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        return model_manager.stop_generation
 def get_system_stats(request: gr.Request = None):
     """Returns a dictionary of current system metrics with formatted strings."""
     mem = psutil.virtual_memory()
 def load_new_model(model_id):
     """Loads the model and tokenizer dynamically into the global manager."""
+    # Stop any ongoing generation immediately
+    model_manager.stop_generation = True
     # Clear old model from memory
     model_manager.model = None
     model_manager.tokenizer = None
+    model_manager.model_id = None
     yield f"Loading {model_id}..."
     gc.collect()
     if torch.cuda.is_available():
         model_manager.tokenizer = tokenizer
         model_manager.model = model
+        model_manager.model_id = model_id
         yield f"Successfully loaded {model_id} on {model_manager.device.upper()}"
     except Exception as e:
     if model_manager.model is None or model_manager.tokenizer is None:
         yield "Please load a model first.", "Model not loaded"
         return
+    # Reset the stop flag for the new generation run
+    model_manager.stop_generation = False
     tokenizer = model_manager.tokenizer
     model = model_manager.model
+    model_id = model_manager.model_id
+    is_supra_reasoning = "Supra-50M-Reasoning" in model_id if model_id else False
+    if is_supra_reasoning:
+        SYSTEM_PROMPT = "Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions."
+        prompt_to_encode = (
+            f"[SYSTEM]: {SYSTEM_PROMPT}\n\n"
+            f"[USER]: {user_prompt}\n\n"
+            f"[ASSISTANT]: <|begin_of_thought|>\n"
+        )
+        skip_special = False
+    else:
+        prompt_to_encode = user_prompt
+        skip_special = True
     # Tokenize input
+    inputs = tokenizer([prompt_to_encode], return_tensors="pt").to(model_manager.device)
     # Set up the streamer
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=skip_special)
     # Adjust variables based on the do_sample logic
     if not do_sample:
         repetition_penalty=float(rep_penalty),
         no_repeat_ngram_size=int(ngram_size),
         do_sample=do_sample,
+        pad_token_id=tokenizer.eos_token_id, # Prevents padding warnings
+        stopping_criteria=StoppingCriteriaList([StopOnFlag()]) # Attach the stopping criteria
     )
     start_time = time.time()
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
     thread.start()
+    if is_supra_reasoning:
+        # Use plain text formatting rather than markdown symbols inside gr.Textbox
+        base_display = f"Prompt: {user_prompt}\n\n----------------------------------------\n\n"
+        generated_text = ""
+    else:
+        base_display = ""
+        generated_text = user_prompt
     # Yield output iteratively for the streaming effect
     token_count = 0
     for new_text in streamer:
+        # Immediately break out of the UI update loop if a new model is loaded
+        if model_manager.stop_generation:
+            break
         generated_text += new_text
         token_count += 1
         duration = time.time() - start_time
         tps = token_count / duration if duration > 0 else 0
+        display_text = generated_text
+        if is_supra_reasoning:
+            display_text = display_text.replace("<s>", "").replace("</s>", "")
+            if not display_text.startswith("🧠 Thinking Process:"):
+                display_text = "🧠 Thinking Process:\n" + display_text
+            display_text = display_text.replace("<|begin_of_thought|>", "🧠 Thinking Process:\n")
+            display_text = display_text.replace("<|end_of_thought|>", "\n\n")
+            display_text = display_text.replace("<|begin_of_solution|>", "✅ Final Answer:\n\n")
+            display_text = display_text.replace("<|end_of_solution|>", "")
+        yield base_display + display_text, f"Speed: {tps:.2f} tokens/sec"
 def clean_cache():
     if os.path.exists(HF_CACHE_DIR):