Shining-Data committed on
Commit 5cf5d21 · verified · 1 Parent(s): 5ba955b

Update app.py

Files changed (1): app.py +32 -42
app.py CHANGED
@@ -7,14 +7,19 @@ from datetime import datetime
 import re  # for parsing <think> blocks
 import gradio as gr
 import torch
-from transformers import pipeline, TextIteratorStreamer
-from transformers import AutoTokenizer
+from transformers import TextIteratorStreamer
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from duckduckgo_search import DDGS
 # import spaces  # Import spaces early to enable ZeroGPU support
 
 # Optional: Disable GPU visibility if you wish to force CPU usage
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
+if torch.cuda.is_available():
+    device = "auto"
+else:
+    device = "cpu"
+
 # ------------------------------
 # Global Cancellation Event
 # ------------------------------
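Note on this hunk: `CUDA_VISIBLE_DEVICES` is cleared a few lines before the new `torch.cuda.is_available()` check, so the check should come back False and `device` will always resolve to "cpu" unless that environment line is removed. A minimal sketch of the selection logic as a reusable helper, assuming GPUs should only be hidden when CPU is explicitly forced (the helper name is hypothetical):

import os
import torch

def pick_device(force_cpu: bool = False) -> str:
    # Hide GPUs before any CUDA initialization when CPU is forced.
    if force_cpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
    # "auto" hands placement to transformers' device_map machinery,
    # which requires the accelerate package to be installed.
    return "auto" if not force_cpu and torch.cuda.is_available() else "cpu"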
@@ -43,30 +48,13 @@ def load_pipeline(model_name):
         return PIPELINES[model_name]
     repo = MODELS[model_name]["repo_id"]
     tokenizer = AutoTokenizer.from_pretrained(repo)
-    for dtype in (torch.bfloat16, torch.float16, torch.float32):
-        try:
-            pipe = pipeline(
-                task="text-generation",
-                model=repo,
-                tokenizer=tokenizer,
-                trust_remote_code=True,
-                torch_dtype=dtype,
-                device=-1  # CPU only  # device_map="auto"
-            )
-            PIPELINES[model_name] = pipe
-            return pipe
-        except Exception:
-            continue
-    # Final fallback
-    pipe = pipeline(
-        task="text-generation",
-        model=repo,
-        tokenizer=tokenizer,
-        trust_remote_code=True,
-        device=-1  # CPU only  # device_map="auto"
-    )
-    PIPELINES[model_name] = pipe
-    return pipe
+    model = AutoModelForCausalLM.from_pretrained(
+        repo,
+        device_map=device,
+        trust_remote_code=True,
+    )
+    PIPELINES[model_name] = {"tokenizer": tokenizer, "model": model}
+    return PIPELINES[model_name]
 
 
 def retrieve_context(query, max_results=6, max_chars=600):
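Because `load_pipeline` now returns a plain dict rather than a callable `transformers` pipeline, callers index into it instead of calling it. A minimal usage sketch under that assumption ("Yee-R1" is a stand-in for an actual key in `MODELS`):

bundle = load_pipeline("Yee-R1")
tokenizer, model = bundle["tokenizer"], bundle["model"]

# One-off, non-streaming generation to sanity-check the loaded model.
inputs = tokenizer("Hello", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))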
@@ -153,19 +141,21 @@ def chat_response(user_msg, chat_history, system_prompt,
-    streamer = TextIteratorStreamer(pipe.tokenizer,
+    streamer = TextIteratorStreamer(pipe["tokenizer"],
                                     skip_prompt=True,
                                     skip_special_tokens=True)
-    gen_thread = threading.Thread(
-        target=pipe,
-        args=(prompt,),
-        kwargs={
-            'max_new_tokens': max_tokens,
-            'temperature': temperature,
-            'top_k': top_k,
-            'top_p': top_p,
-            'repetition_penalty': repeat_penalty,
-            'streamer': streamer,
-            'return_full_text': False,
-        }
-    )
+    generation_config = dict(
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p,
+        max_new_tokens=max_tokens,
+        do_sample=True,
+        repetition_penalty=repeat_penalty,
+        streamer=streamer,
+    )
+    inputs = pipe["tokenizer"](prompt, return_tensors="pt")
+    if device == "auto":
+        input_ids = inputs["input_ids"].cuda()
+    else:
+        input_ids = inputs["input_ids"]
+    gen_thread = threading.Thread(target=lambda: pipe["model"].generate(input_ids=input_ids, **generation_config))
     gen_thread.start()
 
     # Buffers for thought vs answer
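The background thread only produces tokens; the streamer still has to be drained where the response is assembled. A sketch of the consuming side of this pattern, assuming a generator-style Gradio handler (the <think>-block splitting hinted at by the "Buffers" comment is left out):

# TextIteratorStreamer yields decoded text chunks as generate() runs.
answer = ""
for chunk in streamer:
    answer += chunk
    yield answer  # push the partial reply to the UI
gen_thread.join()  # ensure the generation thread has finished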
@@ -253,11 +243,11 @@ with gr.Blocks(title="Yee R1 Demo") as demo:
     with gr.Row():
         with gr.Column(scale=3):
             model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
-            search_chk = gr.Checkbox(label="Enable Web Search", value=True)
+            search_chk = gr.Checkbox(label="Enable Web Search", value=False)
             sys_prompt = gr.Textbox(label="System Prompt", lines=3, value=update_default_prompt(search_chk.value))
             gr.Markdown("### Generation Parameters")
-            max_tok = gr.Slider(64, 16384, value=2048, step=32, label="Max Tokens")
-            temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+            max_tok = gr.Slider(64, 16384, value=4096, step=32, label="Max Tokens")
+            temp = gr.Slider(0.1, 2.0, value=0.6, step=0.1, label="Temperature")
             k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
             p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
             rp = gr.Slider(1.0, 2.0, value=1.2, step=0.1, label="Repetition Penalty")
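One more note: moving from `pipeline(...)` to a raw `model.generate` call in a thread means the "Global Cancellation Event" from the top of the file can be honored through a stopping criterion. A hedged sketch, assuming a module-level `threading.Event` named `cancel_event` (the name is inferred from the banner, not confirmed by this diff):

import threading
from transformers import StoppingCriteria, StoppingCriteriaList

class CancelCriteria(StoppingCriteria):
    # Stop generation as soon as the shared event is set.
    def __init__(self, event: threading.Event):
        self.event = event

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        return self.event.is_set()

# Hypothetical wiring into the generation_config built above:
# generation_config["stopping_criteria"] = StoppingCriteriaList([CancelCriteria(cancel_event)])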