Spaces: stanley-00/slm-testing

Running

App Files Community

stanley-00 committed 3 days ago

Commit

ee07f77

verified ·

1 Parent(s): 9e4c82c

Update app.py

Browse files

Files changed (1) hide show

app.py +104 -35

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import gradio as gr
-from transformers import pipeline
 import gc
 import os
 import shutil
@@ -26,6 +27,15 @@ MODELS = [
     'ThingAI/Quark-50m', 'ThingAI/Quark-135m'
 ]
 def get_system_stats():
     """Returns a dictionary of current system metrics with formatted strings."""
     mem = psutil.virtual_memory()
@@ -37,31 +47,68 @@ def get_system_stats():
     }
 def load_new_model(model_id):
     # Clear old model from memory
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     try:
-        # Load a text-generation pipeline with trust_remote_code enabled
-        pipe = pipeline("text-generation", model=model_id, trust_remote_code=True)
-        return pipe, f"Successfully loaded {model_id}"
     except Exception as e:
-        return None, f"Error loading model: {str(e)}"
-def run_inference(model, user_prompt, max_tokens, temperature, top_k):
-    if not model:
-        return "Please load a model first."
-    # Run inference with additional sampling parameters
-    result = model(
-        user_prompt,
         max_new_tokens=int(max_tokens),
         temperature=float(temperature),
         top_k=int(top_k),
-        do_sample=True
     )
-    return result[0]['generated_text']
 def clean_cache():
     if os.path.exists(HF_CACHE_DIR):
@@ -71,47 +118,69 @@ def clean_cache():
     return "Cache directory not found."
 # Gradio Interface
-with gr.Blocks(title="Small MF Model Tester") as app:
-    current_model = gr.State(None)
     with gr.Row():
-        # Left column: Settings
         with gr.Column(scale=1):
-            # Stats Section
             with gr.Accordion("System Monitoring", open=True):
                 stats_output = gr.JSON(label="Live System Stats")
-                gr.Timer(5).tick(get_system_stats, None, stats_output)
-            model_id_input = gr.Dropdown(choices=MODELS, label="Model", allow_custom_value=True, show_label=False)
-            max_tokens_input = gr.Slider(minimum=10, maximum=1024, value=128, step=1, label="Max Output Tokens")
-            temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
-            top_k_input = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K Sampling")
-            load_btn = gr.Button("Load", variant="secondary")
-            clean_btn = gr.Button("Clean", variant="stop")
-            status_output = gr.Markdown("Status: Waiting to load model...")
         # Right column: Interaction
         with gr.Column(scale=2):
-            user_prompt = gr.Textbox(label="Prompt", value="Once upon a time,", placeholder="Enter your prompt here...", lines=5)
-            run_btn = gr.Button("Run Inference", variant="primary")
-            output_text = gr.Textbox(label="Result", lines=10)
     # Events
     load_btn.click(
         fn=load_new_model,
         inputs=[model_id_input],
-        outputs=[current_model, status_output]
     )
     run_btn.click(
         fn=run_inference,
-        inputs=[current_model, user_prompt, max_tokens_input, temperature_input, top_k_input],
         outputs=[output_text]
     )

 import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
 import gc
 import os
 import shutil
     'ThingAI/Quark-50m', 'ThingAI/Quark-135m'
 ]
+# Global class to safely manage the loaded model and tokenizer in memory
+class ModelManager:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+model_manager = ModelManager()
 def get_system_stats():
     """Returns a dictionary of current system metrics with formatted strings."""
     mem = psutil.virtual_memory()
     }
 def load_new_model(model_id):
+    """Loads the model and tokenizer dynamically into the global manager."""
     # Clear old model from memory
+    model_manager.model = None
+    model_manager.tokenizer = None
     gc.collect()
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
     try:
+        # Load explicitly for streaming purposes instead of pipeline
+        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(model_manager.device)
+        model_manager.tokenizer = tokenizer
+        model_manager.model = model
+        return f"Successfully loaded {model_id} on {model_manager.device.upper()}"
     except Exception as e:
+        return f"Error loading model: {str(e)}"
+def run_inference(user_prompt, max_tokens, temperature, top_k, top_p, rep_penalty, ngram_size, do_sample):
+    """Generates text via streaming generator."""
+    if model_manager.model is None or model_manager.tokenizer is None:
+        yield "Please load a model first."
+        return
+    tokenizer = model_manager.tokenizer
+    model = model_manager.model
+    # Tokenize input
+    inputs = tokenizer([user_prompt], return_tensors="pt").to(model_manager.device)
+    # Set up the streamer
+    streamer = TextIteratorStreamer(tokenizer, timeout=15.0, skip_prompt=True, skip_special_tokens=True)
+    # Adjust variables based on the do_sample logic
+    if not do_sample:
+        temperature = 1.0 # Temperature is ignored if do_sample=False, but setting it > 0 avoids config errors
+    # Generation arguments
+    generate_kwargs = dict(
+        **inputs,
+        streamer=streamer,
         max_new_tokens=int(max_tokens),
         temperature=float(temperature),
         top_k=int(top_k),
+        top_p=float(top_p),
+        repetition_penalty=float(rep_penalty),
+        no_repeat_ngram_size=int(ngram_size),
+        do_sample=do_sample,
+        pad_token_id=tokenizer.eos_token_id # Prevents padding warnings
     )
+    # Start generation in a separate background thread
+    thread = Thread(target=model.generate, kwargs=generate_kwargs)
+    thread.start()
+    # Yield output iteratively for the streaming effect
+    generated_text = user_prompt
+    for new_text in streamer:
+        generated_text += new_text
+        yield generated_text
 def clean_cache():
     if os.path.exists(HF_CACHE_DIR):
     return "Cache directory not found."
 # Gradio Interface
+with gr.Blocks(title="Small MF Model Tester", theme=gr.themes.Soft()) as app:
+    gr.Markdown("# 🚀 Small Model Evaluation Hub with Streaming")
     with gr.Row():
+        # Left column: Settings & Monitoring
         with gr.Column(scale=1):
             with gr.Accordion("System Monitoring", open=True):
                 stats_output = gr.JSON(label="Live System Stats")
+                gr.Timer(2).tick(get_system_stats, None, stats_output)
+            with gr.Group():
+                gr.Markdown("### Model Loader")
+                with gr.Row():
+                    model_id_input = gr.Dropdown(choices=MODELS, label="Model", allow_custom_value=True, show_label=False, scale=3)
+                    load_btn = gr.Button("Load", variant="secondary", scale=1)
+                status_output = gr.Markdown("Status: *Waiting to load model...*")
+                clean_btn = gr.Button("Clean HF Cache", variant="stop", size="sm")
+            with gr.Accordion("Generation Configuration", open=False):
+                do_sample_input = gr.Checkbox(label="Enable Sampling (do_sample)", value=True, info="Uncheck for greedy decoding")
+                max_tokens_input = gr.Slider(minimum=10, maximum=2048, value=128, step=1, label="Max Output Tokens")
+                temperature_input = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature", info="Higher = more creative")
+                gr.Markdown("#### Advanced Sampling Constraints")
+                top_k_input = gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top-K", info="0 = disabled")
+                top_p_input = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P (Nucleus)", info="1.0 = disabled")
+                rep_penalty_input = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty", info="1.0 = disabled")
+                ngram_size_input = gr.Slider(minimum=0, maximum=10, value=0, step=1, label="No Repeat N-Gram Size", info="0 = disabled")
         # Right column: Interaction
         with gr.Column(scale=2):
+            user_prompt = gr.Textbox(
+                label="Prompt",
+                value="Once upon a time in a digital kingdom,",
+                placeholder="Enter your prompt here...",
+                lines=5
+            )
+            run_btn = gr.Button("Generate text (Stream)", variant="primary", size="lg")
+            output_text = gr.Textbox(label="Result", lines=15, show_copy_button=True)
     # Events
     load_btn.click(
         fn=load_new_model,
         inputs=[model_id_input],
+        outputs=[status_output]
     )
+    # We use `.click` targeting a generator function, which Gradio naturally treats as a streaming output
     run_btn.click(
         fn=run_inference,
+        inputs=[
+            user_prompt,
+            max_tokens_input,
+            temperature_input,
+            top_k_input,
+            top_p_input,
+            rep_penalty_input,
+            ngram_size_input,
+            do_sample_input
+        ],
         outputs=[output_text]
     )