RamishRasool14 committed
Commit
1da696e
1 Parent(s): 2ca9a16
Files changed (1)
  1. app.py +74 -27
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 from unsloth import FastLanguageModel
-from transformers import TextStreamer
-import torch
+from transformers import TextIteratorStreamer
+from threading import Thread

 # Load your fine-tuned model and tokenizer
 model, tokenizer = FastLanguageModel.from_pretrained(
@@ -10,29 +10,62 @@ model, tokenizer = FastLanguageModel.from_pretrained(
     dtype='bf16',
     load_in_4bit=False,
 )
-text_streamer = TextStreamer(tokenizer)
 FastLanguageModel.for_inference(model)  # Enable optimized inference

-def predict(question, history):
-    history = history or []
-    history.append({"from": "human", "value": question})
+def get_streaming_generator(model, tokenizer, history, max_new_tokens=8192):
+    """Function that returns a generator yielding streaming outputs"""
+    # Convert history to the format expected by tokenizer
+    formatted_history = []
+    for exchange in history:
+        formatted_history.append({"role": "user", "content": exchange[0]})
+        if len(exchange) > 1 and exchange[1]:
+            formatted_history.append({"role": "assistant", "content": exchange[1]})

     inputs = tokenizer(
-        tokenizer.apply_chat_template(
-            history,
-            tokenize=False,
-            add_generation_prompt=True
-        ),
-        return_tensors="pt"
-    ).to("cuda" if torch.cuda.is_available() else "cpu")
+        [
+            tokenizer.apply_chat_template(formatted_history,
+                                          tokenize=False,
+                                          add_generation_prompt=True),
+        ],
+        return_tensors="pt",
+        padding=True,
+        return_attention_mask=True
+    ).to("cuda")
+
+    # Create the streamer
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
+
+    # Run generation in a separate thread
+    generation_kwargs = dict(
+        input_ids=inputs["input_ids"],
+        attention_mask=inputs["attention_mask"],
+        streamer=streamer,
+        max_new_tokens=max_new_tokens
+    )
+
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    return streamer

-    token = model.generate(**inputs, max_new_tokens=8192, streamer = text_streamer)
-    output = tokenizer.decode(token, skip_special_tokens=True)
-    output = output.split("[/INST]")[-1].strip()
-    yield output
+def predict(message, history):
+    # Add user message to history in the format Gradio expects
+    history = history or []
+    history.append([message, ""])
+
+    # Get the streamer with properly formatted history
+    streamer = get_streaming_generator(model, tokenizer, history)

-    history.append({"from": "gpt", "value": output})
-    return history
+    # Stream the response
+    full_response = ""
+    for text_chunk in streamer:
+        full_response += text_chunk
+        # Update the last message with the current full response
+        history[-1][1] = full_response
+        yield history
+
+def clear_chat():
+    return [], ""

 # Create the Gradio interface with Markdown support
 with gr.Blocks(css=".message { white-space: pre-wrap; }") as iface:
@@ -41,30 +74,44 @@ with gr.Blocks(css=".message { white-space: pre-wrap; }") as iface:
         container=True,
         height=600,
         bubble_full_width=False,
-        render_markdown=True,  # Enable markdown rendering
-        latex_delimiters=[  # Optional: Enable LaTeX rendering
+        render_markdown=True,
+        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},
            {"left": "$", "right": "$", "display": False},
         ],
     )
-    submit = gr.Button("Submit")
     msg = gr.Textbox(
         label="Message",
         placeholder="Type your message here... (Markdown supported)",
         lines=2
     )
+    submit = gr.Button("Submit")
     clear = gr.Button("Clear")

-    state = gr.State([])
+    # Set up the chat interface with streaming
+    msg.submit(
+        predict,
+        [msg, chatbot],
+        [chatbot],
+        api_name="predict"
+    ).then(
+        lambda: "", None, [msg]  # Clear input after submission
+    )

     submit.click(
         predict,
-        [msg, state],
-        [chatbot, state],
-        api_name="predict"
+        [msg, chatbot],
+        [chatbot]
+    ).then(
+        lambda: "", None, [msg]  # Clear input after submission
     )

-    clear.click(lambda: None, None, chatbot, queue=False)
+    clear.click(
+        clear_chat,
+        None,
+        [chatbot, msg],
+        queue=False
+    )

 if __name__ == "__main__":
     iface.launch()
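
The substantive change in this commit is the streaming pattern: model.generate() blocks until every token has been produced, so the new code hands it a TextIteratorStreamer and runs it on a worker thread while the caller iterates the streamer for decoded text as it arrives. Below is a minimal, self-contained sketch of that pattern using plain transformers; the "gpt2" checkpoint and the prompt are placeholders for illustration, not part of this app.

# Minimal sketch of the TextIteratorStreamer pattern, independent of Unsloth.
# The "gpt2" checkpoint and the prompt below are illustrative placeholders.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Once upon a time", return_tensors="pt")

# skip_prompt=True keeps the echoed prompt out of the streamed text;
# skip_special_tokens is forwarded to tokenizer.decode().
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a worker thread
# while the main thread consumes decoded chunks from the streamer.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32),
)
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()

Because the prompt is skipped at the streamer level, the rewritten predict() no longer needs the old split("[/INST]") workaround for stripping the echoed instruction.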
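
On the Gradio side, predict() is now a generator: each yield returns the whole updated history, and Gradio re-renders the Chatbot after every chunk, which produces the typing effect. Here is a sketch of just that wiring, with a fake word loop standing in for the model stream; predict_stub and its canned text are illustrative only.

# Sketch of how Gradio consumes a generator event handler: every yield
# pushes an updated history to the Chatbot component.
import time

import gradio as gr

def predict_stub(message, history):
    history = history or []
    history.append([message, ""])  # same [user, bot] pair format as the commit uses
    for word in "this reply arrives one word at a time".split():
        history[-1][1] += word + " "
        time.sleep(0.1)  # stand-in for waiting on the model
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    msg.submit(predict_stub, [msg, chatbot], [chatbot]).then(lambda: "", None, [msg])

if __name__ == "__main__":
    demo.launch()

The .then() chain mirrors the commit's design: the streaming handler runs to completion first, then a second trivial handler clears the textbox.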