# Source: Hugging Face Space app.py by OzTianlu (commit 1064c57, verified)
import spaces
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import gradio as gr

# Hugging Face model repo served by this Space — single source of truth for
# both the tokenizer and the model load below.
MODEL_ID = "NoesisLab/Kai-30B-Instruct"

# trust_remote_code allows any custom tokenizer code shipped in the repo to run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Reuse MODEL_ID instead of repeating the repo string (the original duplicated
# the literal, which invites drift if the repo name ever changes).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
)
@spaces.GPU
def respond(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Parameters
    ----------
    message : str
        The latest user turn.
    history : list[dict]
        Previous turns as ``{"role": ..., "content": ...}`` dicts
        (Gradio "messages" history format — the dict indexing below
        requires it).

    Yields
    ------
    str
        The accumulated assistant text so far (Gradio streaming
        convention: yield the full response each step, not the delta).
    """
    system_prompt = """You are Kai, a helpful assistant.
You are a logical assistant that follows a strict "Reason-then-Act" process. For every query, you must structure your response into two distinct sections:
1. ### Reasoning Process
- Break down the user's request into smaller parts.
- Check for potential pitfalls or edge cases.
- Draft a step-by-step plan to solve the problem.
- Verify your logic before moving to the final answer.
2. ### Final Answer
- Provide the concise and direct result based on the reasoning above.
- Do not repeat the reasoning; just provide the output.
Strictly follow this format for every response. Begin your thought process now."""
    messages = [{"role": "system", "content": system_prompt}]
    # Renamed the loop variable (the original reused `msg` for both the
    # system prompt and each history turn, shadowing the former).
    for turn in history:
        messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    # skip_prompt drops the echoed input; skip_special_tokens strips EOS etc.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        # Without max_new_tokens, generate() falls back to a tiny default
        # generation length and truncates answers mid-sentence.
        max_new_tokens=2048,
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
    )
    # Run generation on a worker thread so we can consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()
    response = ""
    for token in streamer:
        response += token
        yield response
    # Reap the worker once the streamer is exhausted (generation finished).
    thread.join()
# Gradio chat UI wired to the streaming `respond` generator.
demo = gr.ChatInterface(
    fn=respond,
    # respond() indexes each history entry as a dict ({"role": ..., "content": ...}),
    # which requires Gradio's "messages" history format; the legacy tuple
    # format would raise a TypeError inside respond().
    type="messages",
    title="Chat with Kai-30B-Instruct",
    description="Chat with NoesisLab/Kai-30B-Instruct",
)

if __name__ == "__main__":
    demo.launch()