Wenye He committed
Commit f937954 · verified · 1 Parent(s): 4c5f924

Update app.py

Files changed (1)
  1. app.py +41 -33
app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
-from threading import Thread
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import torch
 import time
 
@@ -19,6 +18,13 @@ MODEL_CONFIG = {
     }
 }
 
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_use_double_quant=True
+)
+
 class ChatModel:
     def __init__(self):
         self.models = {}
@@ -28,64 +34,66 @@ class ChatModel:
         if model_name not in self.models:
             config = MODEL_CONFIG[model_name]
 
-            self.tokenizers[model_name] = AutoTokenizer.from_pretrained(config["model_name"])
-            self.tokenizers[model_name].pad_token = self.tokenizers[model_name].eos_token
+            tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
+            tokenizer.pad_token = tokenizer.eos_token
 
-            self.models[model_name] = AutoModelForCausalLM.from_pretrained(
+            model = AutoModelForCausalLM.from_pretrained(
                 config["model_name"],
+                quantization_config=bnb_config,
                 device_map="auto",
                 torch_dtype=torch.float16,
-                attn_implementation="flash_attention_2" if "phi-3" in model_name else "eager",
                 trust_remote_code=True
             )
+
+            self.models[model_name] = model
+            self.tokenizers[model_name] = tokenizer
 
-    def stream_response(self, message, model_name):
+    def generate(self, message, model_name, history):
+        start_time = time.time()
         self.load_model(model_name)
         config = MODEL_CONFIG[model_name]
-        tokenizer = self.tokenizers[model_name]
-        model = self.models[model_name]
 
+        # Format prompt
         prompt = config["template"].format(message=message)
-        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, timeout=60)
-        generation_kwargs = dict(
+        # Tokenize input
+        inputs = self.tokenizers[model_name](prompt, return_tensors="pt").to("cuda")
+
+        # Generate response
+        outputs = self.models[model_name].generate(
             **inputs,
-            streamer=streamer,
-            max_new_tokens=512,
+            max_new_tokens=384,
            temperature=0.7,
             top_p=0.9,
-            repetition_penalty=1.1,
             do_sample=True,
-            pad_token_id=tokenizer.eos_token_id
+            pad_token_id=self.tokenizers[model_name].eos_token_id
         )
 
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
+        # Decode response
+        response = self.tokenizers[model_name].decode(
+            outputs[0][inputs.input_ids.shape[-1]:],
+            skip_special_tokens=True
+        ).strip()
+
+        # Calculate metrics
+        elapsed_time = time.time() - start_time
+        tokens = outputs[0].shape[0] - inputs.input_ids.shape[-1]
+        tokens_per_sec = tokens / elapsed_time if elapsed_time > 0 else 0
 
-        return streamer, tokenizer, time.time()
+        return response, elapsed_time, tokens_per_sec
 
 model_handler = ChatModel()
 
 def chat(message, history, model_choice):
     try:
-        streamer, tokenizer, start_time = model_handler.stream_response(message, model_choice)
-        buffer = ""
-
-        for new_text in streamer:
-            buffer += new_text
-            yield [(message, buffer)]
-
-        elapsed_time = time.time() - start_time
-        tokens = len(tokenizer.encode(buffer))
-        token_speed = tokens / elapsed_time if elapsed_time > 0 else 0
-        final_response = f"{buffer}\n\n⏱️ {elapsed_time:.2f}s | 🚀 {token_speed:.2f} tokens/s"
-        yield [(message, final_response)]
+        response, response_time, token_speed = model_handler.generate(message, model_choice, history)
+        formatted_response = f"{response}\n\n⏱️ Response Time: {response_time:.2f}s | 🚀 Speed: {token_speed:.2f} tokens/s"
+        return [(message, formatted_response)]
     except Exception as e:
-        yield [(message, f"Error: {str(e)}")]
+        return [(message, f"Error: {str(e)}")]
 
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🚀 Streaming LLM Chatbot (Fixed)")
+    gr.Markdown("# 🚀 LLM Chatbot with Performance Metrics")
     with gr.Row():
         model_choice = gr.Dropdown(
             choices=["phi-3", "llama3-8b"],
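
For readers skimming the diff, the core pattern the new app.py relies on is loading a causal LM with bitsandbytes 4-bit NF4 quantization and timing a blocking generate() call. Below is a minimal, self-contained sketch of that pattern; the model id and prompt template are illustrative placeholders, not the repo's actual MODEL_CONFIG entries.

import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Same quantization settings as the commit: 4-bit NF4 weights,
# fp16 compute, double quantization of the quantization constants.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model_id = "microsoft/Phi-3-mini-4k-instruct"  # placeholder id, not necessarily MODEL_CONFIG's value
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

prompt = "<|user|>\nHello!<|end|>\n<|assistant|>\n"  # placeholder template
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Blocking generation, timed end to end.
start = time.time()
outputs = model.generate(
    **inputs,
    max_new_tokens=384,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)
elapsed = time.time() - start

# Decode only the newly generated tokens and report throughput.
new_tokens = outputs[0][inputs.input_ids.shape[-1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())
print(f"{new_tokens.shape[0] / elapsed:.2f} tokens/s")

Unlike the removed TextIteratorStreamer/Thread path, this call blocks until generation finishes, which is what lets the new chat() compute an exact new-token count from outputs before returning a single formatted reply.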