gemini4 / app.py
togoice's picture
Update app.py
714cb34 verified
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import spaces
# 模型路径保持不变
model_id = "dealignai/Gemma-4-31B-JANG_4M-CRACK"
@spaces.GPU(duration=120) # 指定 GPU 任务时长,帮助 ZeroGPU 更好调度
def chat(message, history):
# 1. 加载 Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# 2. 加载 Model (修正了之前的参数错误)
# 注意:如果 ZeroGPU 显存不足,这里依然可能报 Out of Memory
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True
)
# 3. 准备输入
inputs = tokenizer(message, return_tensors="pt").to("cuda")
streamer = TextIteratorStreamer(tokenizer, timeout=20., skip_prompt=True, skip_special_tokens=True)
# 4. 设置生成参数
generate_kwargs = dict(
inputs,
streamer=streamer,
max_new_tokens=1024,
do_sample=True,
top_p=0.9,
temperature=0.6,
)
# 5. 开启多线程生成
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()
# 6. 实时流式输出
partial_message = ""
for new_token in streamer:
partial_message += new_token
yield partial_message
# 7. 构建界面
demo = gr.ChatInterface(
fn=chat,
title="Gemma 4 31B 数学逻辑推导站",
description="正在运行 31B 参数级别的重型模型,由于文件巨大,首句响应可能需要等待 1-2 分钟。"
)
# 8. 启动服务 (针对 Hugging Face 环境优化)
if __name__ == "__main__":
demo.launch(server_name="0.0.0.0", show_error=True)