import spaces
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import gradio as gr

MODEL_ID = "NoesisLab/Kai-30B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Use MODEL_ID here too (the original duplicated the hard-coded string,
# which would silently diverge if the constant were ever changed).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
)

# Hoisted to module level so it is built once and cannot be shadowed
# inside respond() (the original reused the name `msg` for both the
# prompt and the history loop variable).
SYSTEM_PROMPT = """You are Kai, a helpful assistant. You are a logical assistant that follows a strict "Reason-then-Act" process. For every query, you must structure your response into two distinct sections: 1. ### Reasoning Process - Break down the user's request into smaller parts. - Check for potential pitfalls or edge cases. - Draft a step-by-step plan to solve the problem. - Verify your logic before moving to the final answer. 2. ### Final Answer - Provide the concise and direct result based on the reasoning above. - Do not repeat the reasoning; just provide the output. Strictly follow this format for every response. Begin your thought process now."""


@spaces.GPU
def respond(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Args:
        message: The latest user message (str).
        history: Prior turns in Gradio "messages" format — a list of
            dicts with "role" and "content" keys.

    Yields:
        The accumulated assistant response text, growing token by token,
        so gr.ChatInterface can render a live stream.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for turn in history:
        messages.append({"role": turn["role"], "content": turn["content"]})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    # skip_prompt=True: only newly generated tokens are streamed back.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        # Without an explicit bound, generate() can fall back to the tiny
        # default max_length=20 when the model's generation config does not
        # set one, truncating every answer.
        max_new_tokens=1024,
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
    )
    # generate() blocks, so it runs in a worker thread while this
    # generator drains the streamer and yields partial text to the UI.
    thread = Thread(target=model.generate, kwargs=generate_kwargs)
    thread.start()

    response = ""
    for token in streamer:
        response += token
        yield response


demo = gr.ChatInterface(
    fn=respond,
    title="Chat with Kai-30B-Instruct",
    description="Chat with NoesisLab/Kai-30B-Instruct",
)

if __name__ == "__main__":
    demo.launch()