"""Gradio chat demo that serves the VibeThinker-1.5B causal language model."""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces

# Model config
MODEL_ID = "WeiboAI/VibeThinker-1.5B"
SYSTEM_PROMPT = "You are a concise solver. Respond briefly."

# Load the tokenizer and model once at import time so that every chat request
# reuses the same objects instead of reloading them.
print("Loading model...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded!")


def _content_to_text(content):
    """Flatten the ``content`` of a dict-style history entry to plain text.

    Strings are returned unchanged.  A list is treated as a sequence of
    content parts: the ``"text"`` value of each dict part is kept and parts
    without one contribute nothing.  Anything else goes through ``str()``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): assumes list content is made of parts shaped like
        # {"type": "text", "text": ...} -- confirm against the Gradio version
        # actually deployed.
        return "".join(
            str(part.get("text", "")) if isinstance(part, dict) else str(part)
            for part in content
        )
    return str(content)


def _history_to_messages(history):
    """Convert chat history into a list of role/content message dicts.

    Two history shapes are accepted:

    * ``(user_msg, assistant_msg)`` pairs, where either side may be ``None``
      and is then skipped;
    * ``{"role": ..., "content": ...}`` dicts, of which only ``"user"`` and
      ``"assistant"`` entries with non-``None`` content are kept.

    Unpacking a dict entry as a pair would silently yield its *keys*, which
    is why the two shapes are told apart explicitly.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and content is not None:
                messages.append(
                    {"role": role, "content": _content_to_text(content)}
                )
            continue

        user_msg, assistant_msg = turn
        if user_msg is not None:
            messages.append({"role": "user", "content": str(user_msg)})
        if assistant_msg is not None:
            messages.append(
                {"role": "assistant", "content": str(assistant_msg)}
            )
    return messages


@spaces.GPU
def chat_with_stream(message, history, progress=gr.Progress()):
    """Generate one assistant reply for ``message`` given the chat history.

    Despite the name, the reply is not streamed token by token: the full
    response is generated first and returned as a single string, while
    ``progress`` is updated between the stages.

    Args:
        message: The new user message.  ``None`` is replaced by ``"Hello"``;
            any other value is converted with ``str()``.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            role/content dicts.  ``None`` is treated as an empty history.
        progress: Gradio progress tracker used to report the current stage.

    Returns:
        The newly generated text, with special tokens removed and surrounding
        whitespace stripped.
    """
    # Handle inputs safely
    if message is None:
        message = "Hello"
    if history is None:
        history = []
    message = str(message)

    progress(0.1, desc="Building conversation...")
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))

    progress(0.3, desc="Adding your message...")
    messages.append({"role": "user", "content": message})

    progress(0.5, desc="Formatting input...")
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    progress(0.6, desc="Tokenizing...")
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    progress(0.7, desc="Starting generation...")
    # Gradients are never needed for inference.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1000,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            return_dict_in_generate=True,
            output_scores=False,
        )

    progress(0.9, desc="Decoding response...")
    # With return_dict_in_generate=True the token ids live in
    # ``outputs.sequences`` (one row per batch item), and each row starts with
    # the prompt tokens.  Cutting the prompt off by length keeps only the new
    # reply; splitting the decoded text on the word "assistant" would truncate
    # any reply that happens to contain that word.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs.sequences[0][prompt_length:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    progress(1.0, desc="Complete!")
    return response


def create_demo():
    """Build and return the ``gr.ChatInterface`` for the chat function."""
    demo = gr.ChatInterface(
        fn=chat_with_stream,
        title="VibeThinker Chat",
        description="Simple chat with VibeThinker-1.5B",
        examples=["2+2", "What is AI?", "Write a poem"],
    )
    return demo


if __name__ == "__main__":
    print("Starting...")
    demo = create_demo()
    demo.launch(share=False)