import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
import spaces

# Model identifier handed to `from_pretrained` for both the tokenizer and the
# model weights (kept unchanged).
model_id = "dealignai/Gemma-4-31B-JANG_4M-CRACK"

# (tokenizer, model) pairs already loaded in this process, keyed by model id.
_LOADED = {}


def _load_tokenizer_and_model():
    """Return the ``(tokenizer, model)`` pair for ``model_id``.

    The pair is loaded on first use and then reused for the lifetime of the
    current process, so follow-up messages do not reload the weights.
    """
    if model_id not in _LOADED:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        # NOTE: if the GPU does not have enough memory, this call can still
        # fail with an out-of-memory error.
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        _LOADED[model_id] = (tokenizer, model)
    return _LOADED[model_id]


def _text_of(content):
    """Flatten a chat message ``content`` value to plain text ("" if none)."""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_text_of(part) for part in content)
    return ""


def _conversation(message, history):
    """Turn Gradio's ``history`` plus the new ``message`` into role/content dicts.

    Accepts both history layouts Gradio can pass to a chat function: a list of
    ``{"role": ..., "content": ...}`` dicts, or a list of
    ``(user_text, assistant_text)`` pairs.
    """
    flat = []
    for entry in history or []:
        if isinstance(entry, dict):
            flat.append((entry.get("role"), entry.get("content")))
        else:
            user_text, assistant_text = entry
            flat.append(("user", user_text))
            flat.append(("assistant", assistant_text))
    flat.append(("user", message))

    turns = []
    for role, content in flat:
        text = _text_of(content)
        if role not in ("user", "assistant") or not text:
            continue
        if turns and turns[-1]["role"] == role:
            # Some chat templates reject two consecutive turns from the same
            # role, so fold them into one turn instead.
            turns[-1]["content"] += "\n\n" + text
        else:
            turns.append({"role": role, "content": text})
    return turns


@spaces.GPU(duration=120)  # declare the GPU task duration to help ZeroGPU scheduling
def chat(message, history):
    """Stream the model's reply to ``message`` for a Gradio chat interface.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of the conversation as supplied by Gradio
            (message dicts or ``(user, assistant)`` pairs).

    Yields:
        The reply generated so far; each yielded string extends the previous
        one by the newly decoded text.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here once the stream has ended.
        queue.Empty: If no new text arrives from the streamer within 20 s.
    """
    # 1-2. Tokenizer and model (loaded once per process, then reused).
    tokenizer, model = _load_tokenizer_and_model()

    # 3. Build the prompt. When the tokenizer ships a chat template, feed it
    # the whole conversation so earlier turns are taken into account; without
    # one, fall back to tokenizing the raw message.
    if getattr(tokenizer, "chat_template", None):
        inputs = tokenizer.apply_chat_template(
            _conversation(message, history),
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        )
    else:
        inputs = tokenizer(message, return_tensors="pt")
    # With device_map="auto" the placement is chosen at load time, so follow
    # the model instead of hard-coding "cuda".
    inputs = inputs.to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True
    )

    # 4. Generation parameters.
    generate_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.9,
        temperature=0.6,
    )

    # 5. Generate in a background thread so text can be consumed as it arrives.
    failures = []

    def _generate():
        try:
            model.generate(**generate_kwargs)
        except Exception as exc:  # thread boundary: re-raised by the consumer
            failures.append(exc)
            # Close the stream so the consumer stops right away instead of
            # waiting for the streamer timeout.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    # 6. Stream the growing reply back to the UI.
    partial_message = ""
    for new_text in streamer:
        partial_message += new_text
        yield partial_message

    # The stream has ended, so generation has finished (or failed).
    worker.join()
    if failures:
        raise failures[0]

# 7. Build the chat UI around the streaming `chat` handler.
demo = gr.ChatInterface(
    chat,
    title="Gemma 4 31B 数学逻辑推导站",
    description="正在运行 31B 参数级别的重型模型，由于文件巨大，首句响应可能需要等待 1-2 分钟。",
)

# 8. Start the service (settings chosen for the Hugging Face environment).
if __name__ == "__main__":
    launch_options = {"server_name": "0.0.0.0", "show_error": True}
    demo.launch(**launch_options)