"""
huggingface_hub==0.30.1
transformers==4.48.2
# gradio==5.0.1
gradio==5.23.2
torch==2.5.1
pydantic==2.8.2
"""

import gradio as gr
print("Gradio version:", gr.__version__)
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch
from threading import Thread

# import os; os.chdir(os.path.dirname(__file__))

# Identifier handed to from_pretrained() for both the tokenizer and the model
# (presumably a Hugging Face Hub repo id -- confirm if a local path is intended).
model_name = "fzmnm/TinyLili-zh-64M"


# Generation settings shared by the rest of the file:
#   max_tokens      -- chat() keeps only the last max_tokens prompt token ids.
#   max_new_tokens  -- passed to model.generate() as max_new_tokens.
#   temperature     -- default value of the Temperature slider and of each example row.
#   top_p           -- passed to model.generate() as top_p.
max_tokens=4096
max_new_tokens=1024
temperature=0.7
top_p=0.95

# Loaded once at import time; chat() reuses these module-level objects.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Put the model in evaluation mode; this script only runs generation.
model.eval()

# Use the tokenizer's EOS id as the pad token id during generation
# (presumably because no dedicated pad token is configured -- TODO confirm).
model.generation_config.pad_token_id = tokenizer.eos_token_id


def build_input_str(message: str, history: 'list[dict]') -> str:
    """Render the conversation plus the new user message into a prompt string.

    Args:
        message: The new user message to append to the conversation.
        history: Prior conversation as a list of message dicts carrying
            'role' and 'content' keys (the ChatInterface in this file is
            created with type='messages'). The list is not mutated.

    Returns:
        The chat-template rendering of the conversation, followed by the
        opening of an assistant turn so that the model continues as the
        assistant.
    """
    # Concatenation builds a new list, leaving the caller's history untouched.
    conversation = history + [{'role': 'user', 'content': message}]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    # Open the assistant turn by hand; generation continues from here.
    return prompt + '\n<|im_start|>assistant\n'

def stop_criteria(input_str):
    """Return True when *input_str* ends with one of the end-of-turn markers.

    The recognized markers are '<s>' and '<|im_end|>'.
    """
    # str.endswith accepts a tuple of suffixes and tests them in a single call.
    return input_str.endswith(('<s>', '<|im_end|>'))

def remove_ending(input_str):
    """Strip a single trailing end-of-turn marker from *input_str*.

    Both markers that terminate generation in this file ('<|im_end|>' and
    '<s>') are handled, so neither of them leaks into the text shown to the
    user. At most one marker is removed; text without a trailing marker is
    returned unchanged.

    Args:
        input_str: The text generated so far.

    Returns:
        *input_str* without its trailing marker, if it had one.
    """
    for end_token in ('<|im_end|>', '<s>'):
        if input_str.endswith(end_token):
            # Slice by the marker's own length rather than a hard-coded count.
            return input_str[:-len(end_token)]
    return input_str

class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation once the text ends with an end marker.

    The decision is delegated to stop_criteria(), applied to the decoded text
    of the first row of ``input_ids``.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Only row 0 is inspected (assumes a single sequence per call -- TODO
        # confirm if batched generation is ever used).
        first_row = input_ids[0]
        decoded_text = tokenizer.decode(first_row, skip_special_tokens=True)
        return stop_criteria(decoded_text)


class _StopOnCancel(StoppingCriteria):
    """Stopping criterion that ends generation once a cancellation event is set."""

    def __init__(self, cancel_event):
        self.cancel_event = cancel_event

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return self.cancel_event.is_set()


def chat(message, history, temperature):
    """Stream the model's reply to *message* as progressively longer strings.

    Generation runs on a background thread and is consumed through a
    TextIteratorStreamer. If this generator is closed or fails before the
    model finishes, the worker is told to stop through a cancellation event
    instead of being left to run until max_new_tokens.

    Args:
        message: The new user message.
        history: Prior conversation, forwarded to build_input_str().
        temperature: Sampling temperature; converted with float().

    Yields:
        The reply accumulated so far, with a trailing end marker removed by
        remove_ending().

    Raises:
        RuntimeError: If the generation thread is still alive one second
            after cancellation was requested.
    """
    # Function-scope import: the file's top-level import only brings in Thread.
    from threading import Event

    input_str = build_input_str(message, history)
    input_ids = tokenizer.encode(input_str, return_tensors="pt")
    # Keep only the most recent max_tokens prompt tokens.
    input_ids = input_ids[:, -max_tokens:]
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    cancel_event = Event()
    stopping_criteria = StoppingCriteriaList(
        [StopOnTokens(), _StopOnCancel(cancel_event)]
    )
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        do_sample=True,
        temperature=float(temperature),
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    try:
        output_str = ""
        for new_str in streamer:
            output_str += new_str
            yield remove_ending(output_str)
        t.join()
    finally:
        if t.is_alive():
            # Reached when the consumer stopped early or an error occurred:
            # ask generate() to stop at its next stopping-criteria check.
            cancel_event.set()
            print('Canceling thread...')
            t.join(timeout=1)
            if t.is_alive():
                raise RuntimeError("Thread did not terminate properly.")

# Example prompts offered in the chat UI. They are passed to the model as-is,
# so the strings themselves stay untranslated. Rough English glosses, in order:
#   1. What is there to do for fun in Beijing?
#   2. What is good to eat on Saturn?
#   3. What is a black hole?
#   4. Must a person's purpose be recognized by society?
#   5. An 80-year-old grandmother insists on living alone in the countryside
#      despite a recent fall; how can the family persuade her to move?
example_strs=[
    '北京有什么好玩的? ', 
    '土星上有什么好吃的', 
    '什么是黑洞？',
    '一个人的目的是否必须要被社会认可？', 
    '奶奶今年八十岁了，可她还是坚持一个人住乡下，说那是她的根。我们全家都劝她搬来城市，可她总说“住得舒服，比啥都重要”。但她上个月摔了一跤，脚还没完全好，万一再出事怎么办？她那么倔，我们还能怎么劝呢？', 
]

# Chat UI wired to the streaming `chat` generator. The title string means
# "Chatbot". Each example row pairs a prompt with the default temperature,
# matching the single additional input (the Temperature slider) declared below.
app = gr.ChatInterface(
    fn=chat, 
    type='messages',
    examples=[[s,temperature] for s in example_strs],
    title='聊天机器人',
    stop_btn=True,
    # run_examples_on_click=False,  # Workaround for a bug where clicking an example does not switch the stop button on; this option is not supported in gradio 5.0.1.
    additional_inputs=[
        gr.Slider(minimum=0.1, maximum=4.0, value=temperature, step=0.05, label='Temperature'),
    ],
    cache_examples=False,
)

app.queue()

# Launch the server only when this file is run as a script.
if __name__ == "__main__":
    app.launch()