| import gradio as gr |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| from threading import Thread |
| import spaces |
|
|
| |
| model_id = "dealignai/Gemma-4-31B-JANG_4M-CRACK" |
|
|
| @spaces.GPU(duration=120) |
| def chat(message, history): |
| |
| tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) |
| |
| |
| |
| model = AutoModelForCausalLM.from_pretrained( |
| model_id, |
| device_map="auto", |
| torch_dtype=torch.bfloat16, |
| low_cpu_mem_usage=True, |
| trust_remote_code=True |
| ) |
| |
| |
| inputs = tokenizer(message, return_tensors="pt").to("cuda") |
| streamer = TextIteratorStreamer(tokenizer, timeout=20., skip_prompt=True, skip_special_tokens=True) |
| |
| |
| generate_kwargs = dict( |
| inputs, |
| streamer=streamer, |
| max_new_tokens=1024, |
| do_sample=True, |
| top_p=0.9, |
| temperature=0.6, |
| ) |
| |
| |
| t = Thread(target=model.generate, kwargs=generate_kwargs) |
| t.start() |
| |
| |
| partial_message = "" |
| for new_token in streamer: |
| partial_message += new_token |
| yield partial_message |
|
|
| |
| demo = gr.ChatInterface( |
| fn=chat, |
| title="Gemma 4 31B 数学逻辑推导站", |
| description="正在运行 31B 参数级别的重型模型,由于文件巨大,首句响应可能需要等待 1-2 分钟。" |
| ) |
|
|
| |
| if __name__ == "__main__": |
| demo.launch(server_name="0.0.0.0", show_error=True) |