Update app.py
app.py
CHANGED
@@ -9,13 +9,14 @@ from datetime import datetime
 import re # for parsing <think> blocks
 import gradio as gr
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import pipeline, TextIteratorStreamer
+from transformers import AutoTokenizer
 from duckduckgo_search import DDGS
 
 from transformers import modeling_utils
 if not hasattr(modeling_utils, "ALL_PARALLEL_STYLES") or modeling_utils.ALL_PARALLEL_STYLES is None:
     modeling_utils.ALL_PARALLEL_STYLES = ["tp", "none","colwise",'rowwise']
-
+
 # import spaces # Import spaces early to enable ZeroGPU support
 
 # Optional: Disable GPU visibility if you wish to force CPU usage
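For context on the new imports: the commit swaps the Space's hand-rolled TextIterStreamer (removed in the next hunk) for transformers' built-in TextIteratorStreamer. A minimal sketch of that pattern, using gpt2 only as a stand-in checkpoint, not one of this Space's models:

# Reference sketch (not part of this commit): generate() runs in a worker thread
# while the main thread iterates over decoded text chunks as they arrive.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
inputs = tok("Hello, my name is", return_tensors="pt")
Thread(target=model.generate,
       kwargs={**inputs, "max_new_tokens": 32, "streamer": streamer}).start()

for chunk in streamer:   # yields text pieces until generation finishes
    print(chunk, end="", flush=True)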
@@ -44,69 +45,66 @@ MODELS = {
 # Global cache for pipelines to avoid re-loading.
 PIPELINES = {}
 
-class TextIterStreamer:
-    def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
-        self.tokenizer = tokenizer
-        self.skip_prompt = skip_prompt
-        self.skip_special_tokens = skip_special_tokens
-        self.tokens = []
-        self.text_queue = Queue()
-        # self.text_queue = []
-        self.next_tokens_are_prompt = True
-
-    def put(self, value):
-        if self.skip_prompt and self.next_tokens_are_prompt:
-            self.next_tokens_are_prompt = False
-        else:
-            if len(value.shape) > 1:
-                value = value[0]
-            self.tokens.extend(value.tolist())
-            word = self.tokenizer.decode(self.tokens, skip_special_tokens=self.skip_special_tokens)
-            # self.text_queue.append(word)
-            self.text_queue.put(word)
-
-    def end(self):
-        # self.text_queue.append(None)
-        self.text_queue.put(None)
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        value = self.text_queue.get()
-        if value is None:
-            raise StopIteration()
-        else:
-            return value
-
-
 def load_pipeline(model_name):
     """
     Load and cache a transformers pipeline for text generation.
     Tries bfloat16, falls back to float16 or float32 if unsupported.
     """
     global PIPELINES
-
-    if model_name in PIPELINES.keys():
+    if model_name in PIPELINES:
         return PIPELINES[model_name]
     repo = MODELS[model_name]["repo_id"]
     if model_name == "secgpt-mini":
+        tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
+    else:
         tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True, subfolder="models")
+    for dtype in (torch.bfloat16, torch.float16, torch.float32):
+        try:
+            if model_name == "secgpt-mini":
+                pipe = pipeline(
+                    task="text-generation",
+                    model=repo,
+                    tokenizer=tokenizer,
+                    trust_remote_code=True,
+                    torch_dtype=dtype,
+                    device_map=device,
+                    subfolder="models"
+                )
+            else:
+                pipe = pipeline(
+                    task="text-generation",
+                    model=repo,
+                    tokenizer=tokenizer,
+                    trust_remote_code=True,
+                    torch_dtype=device,
+                    device_map="auto",
+                )
+            PIPELINES[model_name] = pipe
+            return pipe
+        except Exception:
+            continue
+    # Final fallback
+    if model_name == "secgpt-mini":
+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
             trust_remote_code=True,
-        )
-    else:
-        tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
-        model = AutoModelForCausalLM.from_pretrained(
-            repo,
+            torch_dtype=dtype,
             device_map=device,
+            subfolder="models"
+        )
+    else:
+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
+            trust_remote_code=True,
+            device_map=device
+        )
+    PIPELINES[model_name] = pipe
+    return pipe
+
 
 
 def retrieve_context(query, max_results=6, max_chars=600):
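The rewritten load_pipeline tries bfloat16, then float16, then float32, catching load failures and falling through to the next dtype. A standalone sketch of that fallback idea, using a placeholder repo id rather than one of the Space's MODELS entries:

# Standalone sketch of the dtype-fallback pattern used above; "some-org/some-model"
# is a placeholder repo id, not a real checkpoint from this Space.
import torch
from transformers import pipeline

def load_with_dtype_fallback(repo_id="some-org/some-model"):
    last_err = None
    for dtype in (torch.bfloat16, torch.float16, torch.float32):
        try:
            # Hardware that rejects bfloat16/float16 raises here; try the next dtype.
            return pipeline("text-generation", model=repo_id,
                            torch_dtype=dtype, device_map="auto")
        except Exception as err:
            last_err = err
    raise RuntimeError(f"could not load {repo_id}") from last_err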
@@ -182,26 +180,24 @@ def chat_response(user_msg, chat_history, system_prompt,
     enriched = system_prompt
 
     pipe = load_pipeline(model_name)
-    prompt = format_conversation(history, enriched, pipe["tokenizer"])
+    prompt = format_conversation(history, enriched, pipe.tokenizer)
     prompt_debug = f"\n\n--- Prompt Preview ---\n```\n{prompt}\n```"
-    streamer = TextIterStreamer(pipe["tokenizer"],
+    streamer = TextIteratorStreamer(pipe.tokenizer,
         skip_prompt=True,
         skip_special_tokens=True)
+    gen_thread = Thread(
+        target=pipe,
+        args=(prompt,),
+        kwargs={
+            'max_new_tokens': max_tokens,
+            'temperature': temperature,
+            'top_k': top_k,
+            'top_p': top_p,
+            'repetition_penalty': repeat_penalty,
+            'streamer': streamer,
+            'return_full_text': False,
+        }
     )
-    inputs = pipe["tokenizer"](prompt, return_tensors="pt")
-    if device == "auto":
-        input_ids = inputs["input_ids"].cuda()
-    else:
-        input_ids = inputs["input_ids"]
-    gen_thread = Thread(target=lambda: pipe["model"].generate(input_ids=input_ids, **generation_config))
     gen_thread.start()
 
     # Buffers for thought vs answer
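Downstream of this hunk (outside the diff), chat_response presumably drains the streamer while gen_thread runs and separates <think> blocks from the visible answer. A hedged sketch of such a consumer loop, assuming that split is done on the accumulated text:

# Hedged sketch of a consumer loop matching the Thread/streamer setup above; the
# Space's actual buffering of <think> vs. answer text is not shown in this commit.
def consume(streamer):
    raw = ""
    for chunk in streamer:                      # blocks until the worker emits text
        raw += chunk
        if "</think>" in raw:
            thought, answer = raw.split("</think>", 1)
            thought = thought.replace("<think>", "").strip()
        else:
            thought, answer = raw.replace("<think>", "").strip(), ""
        yield thought, answer                   # partial thought/answer for the UI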