import gradio as gr import torch from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer from threading import Thread import spaces # 模型路径保持不变 model_id = "dealignai/Gemma-4-31B-JANG_4M-CRACK" @spaces.GPU(duration=120) # 指定 GPU 任务时长,帮助 ZeroGPU 更好调度 def chat(message, history): # 1. 加载 Tokenizer tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) # 2. 加载 Model (修正了之前的参数错误) # 注意:如果 ZeroGPU 显存不足,这里依然可能报 Out of Memory model = AutoModelForCausalLM.from_pretrained( model_id, device_map="auto", torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True ) # 3. 准备输入 inputs = tokenizer(message, return_tensors="pt").to("cuda") streamer = TextIteratorStreamer(tokenizer, timeout=20., skip_prompt=True, skip_special_tokens=True) # 4. 设置生成参数 generate_kwargs = dict( inputs, streamer=streamer, max_new_tokens=1024, do_sample=True, top_p=0.9, temperature=0.6, ) # 5. 开启多线程生成 t = Thread(target=model.generate, kwargs=generate_kwargs) t.start() # 6. 实时流式输出 partial_message = "" for new_token in streamer: partial_message += new_token yield partial_message # 7. 构建界面 demo = gr.ChatInterface( fn=chat, title="Gemma 4 31B 数学逻辑推导站", description="正在运行 31B 参数级别的重型模型,由于文件巨大,首句响应可能需要等待 1-2 分钟。" ) # 8. 启动服务 (针对 Hugging Face 环境优化) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", show_error=True)