Spaces:

Staticaliza
/

Voice

Paused

App Files Files Community

Staticaliza committed on Nov 1, 2024

Commit

3a7347e

verified ·

1 Parent(s): b3bce7b

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -32

app.py CHANGED Viewed

@@ -1,20 +1,12 @@
-import spaces
-import gradio as gr
-from transformers import AutoTokenizer
-from auto_gptq import AutoGPTQForCausalLM
-# Model identifier
-model_id = "xmadai/Mistral-Large-Instruct-2407-xMADai-INT4"
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)
-# Removed the invalid decorator
 class ModelWrapper:
     def __init__(self):
         self.model = None  # Model will be loaded when GPU is allocated
-    @spaces.GPU  # Use the correct decorator
     def generate(self, prompt):
         if self.model is None:
             # Load the model when GPU is allocated
@@ -23,32 +15,28 @@ class ModelWrapper:
                 device_map='auto',
                 trust_remote_code=True,
             )
         # Tokenize the input prompt
         inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
-        # Generate text
-        outputs = self.model.generate(
             **inputs,
             do_sample=True,
-            max_new_tokens=512
         )
-        # Decode the generated text
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return generated_text
-# Instantiate the model wrapper
-model_wrapper = ModelWrapper()
-# Create the Gradio interface
-interface = gr.Interface(
-    fn=model_wrapper.generate,
-    inputs=gr.Textbox(lines=5, label="Input Prompt"),
-    outputs=gr.Textbox(label="Generated Text"),
-    title="Mistral-Large-Instruct-2407 Text Completion",
-    description="Enter a prompt and receive a text completion using the Mistral-Large-Instruct-2407 INT4 model."
-)
-if __name__ == "__main__":
-    interface.launch()

+import torch
+from transformers import TextIteratorStreamer
+import threading
 class ModelWrapper:
     def __init__(self):
         self.model = None  # Model will be loaded when GPU is allocated
+    @spaces.GPU
     def generate(self, prompt):
         if self.model is None:
             # Load the model when GPU is allocated
                 device_map='auto',
                 trust_remote_code=True,
             )
+            self.model.eval()
         # Tokenize the input prompt
         inputs = tokenizer(prompt, return_tensors='pt').to('cuda')
+        # Set up the streamer
+        streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+        # Prepare generation arguments
+        generation_kwargs = dict(
             **inputs,
+            streamer=streamer,
             do_sample=True,
+            max_new_tokens=512,
         )
+        # Start generation in a separate thread to enable streaming
+        thread = threading.Thread(target=self.model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Yield generated text in real-time
+        generated_text = ""
+        for new_text in streamer:
+            generated_text += new_text
+            yield generated_text