File size: 2,495 Bytes
948874b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Checkpoint to serve, as a Hugging Face Hub repo id. Set the MODEL_ID
# environment variable to override the default, for example with your own
# upload such as "your-username/my-qwen-2.5-3b-instruct".
MODEL_ID = os.environ.get("MODEL_ID", "Qwen/Qwen2.5-3B-Instruct")

# Half precision when CUDA is available, full precision otherwise.
if torch.cuda.is_available():
    _model_dtype = torch.float16
else:
    _model_dtype = torch.float32

# Both objects are loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=_model_dtype,
)

def hf_chat(system_prompt: str, user_text: str, max_tokens: int = 220) -> str:
    """Generate a single-turn assistant reply to ``user_text``.

    The prompt is rendered with the tokenizer's own chat template whenever
    it defines one, so the model receives the turn markers it was trained
    on (Qwen2.5-Instruct uses ``<|im_start|>``/``<|im_end|>``, not
    ``<|system|>``/``<|user|>`` tags). Tokenizers without a chat template
    fall back to the plain tag-based prompt.

    Args:
        system_prompt: Instructions for the assistant. When empty, no system
            message is sent on the chat-template path, so the template's own
            default (if it defines one) applies.
        user_text: The user's message.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded reply, cut at the first leftover chat marker and
        stripped of surrounding whitespace.
    """
    system_prompt = (system_prompt or "").strip()
    user_text = (user_text or "").strip()

    if getattr(tokenizer, "chat_template", None):
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": user_text})
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains its special tokens; do not
        # let the tokenizer add a second set.
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    else:
        prompt = (
            f"<|system|>\n{system_prompt}\n<|user|>\n{user_text}\n<|assistant|>\n"
        )
        inputs = tokenizer(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            # Greedy decoding. No temperature is passed: it is a sampling-only
            # setting and supplying it here only triggers a config warning.
            do_sample=False,
            use_cache=True,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    # Cut the reply at any chat marker that survived decoding as plain text.
    for marker in (
        "<|assistant|>",
        "<|user|>",
        "<|system|>",
        "<|im_end|>",
        "<|im_start|>",
    ):
        text = text.split(marker)[0]

    return text.strip()


def predict(message, history, system_prompt_input):
    """Chat callback: answer ``message`` under the given system prompt.

    ``history`` is accepted but not used; every reply is generated from the
    current message and the system prompt alone.
    """
    return hf_chat(system_prompt_input, message)


# Build the page: a heading, an editable system prompt, and the chat widget.
with gr.Blocks() as demo:
    gr.Markdown("# MezayaAI Qwen2.5-3B-Instruct Demo")
    system_prompt_input = gr.Textbox(
        label="System Prompt",
        value="You are a helpful AI assistant.",
        lines=2,
    )
    chatbot = gr.ChatInterface(
        predict,
        chatbot=gr.Chatbot(height=300),
        textbox=gr.Textbox(placeholder="Ask me a question", container=False, scale=7),
        # ChatInterface calls fn(message, history, *additional_inputs).
        # Without this, predict() is invoked with two arguments and raises
        # TypeError, and the System Prompt box above is never read.
        additional_inputs=[system_prompt_input],
        title="Qwen2.5-3B-Instruct Chat",
        description="Ask Qwen2.5-3B-Instruct anything!",
        theme="soft",
    )

if __name__ == "__main__":
    demo.launch(debug=True)