File size: 2,344 Bytes
88fc169
 
 
 
 
 
c9b16c8
88fc169
 
c9b16c8
 
88fc169
 
c9b16c8
 
88fc169
 
3e93048
 
 
 
 
 
 
 
88fc169
c9b16c8
 
 
88fc169
c9b16c8
88fc169
c9b16c8
88fc169
 
 
3e93048
 
 
88fc169
3e93048
88fc169
c9b16c8
88fc169
 
 
 
 
 
 
 
 
 
 
c9b16c8
 
3e93048
c9b16c8
 
 
3e93048
 
88fc169
 
 
eb0271e
c9b16c8
eb0271e
 
 
 
 
 
 
 
 
 
88fc169
eb0271e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import gradio as gr
import copy
from llama_cpp import Llama
from huggingface_hub import hf_hub_download  

# Model setup (Qwen is pinned directly to avoid repo/filename errors).
# Downloads the quantized GGUF weights from the Hugging Face Hub on first
# run, then loads them with llama.cpp. This executes at import time.
llm = Llama(
    model_path=hf_hub_download(
        repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
        filename="qwen2.5-1.5b-instruct-q4_k_m.gguf",
    ),
    n_ctx=2048,  # context window in tokens
    n_gpu_layers=0, # set to 0 so it runs stably on CPU only
    verbose=False
) 

def generate_text(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion from the local Qwen model.

    Builds a ChatML-format prompt from the system message, the prior
    (user, assistant) turns, and the new user message, then yields the
    accumulated response text after each streamed chunk so Gradio can
    render it incrementally.

    Args:
        message: Latest user message.
        history: Prior turns as (user, assistant) string pairs.
        system_message: System prompt placed at the top of the prompt.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling cutoff.

    Yields:
        str: The full response generated so far, growing per chunk.
    """
    # Qwen uses the ChatML prompt format: <|im_start|>role ... <|im_end|>.
    # Build the pieces in a list and join once instead of repeated +=.
    parts = [f"<|im_start|>system\n{system_message}<|im_end|>\n"]
    for user_turn, assistant_turn in history:
        parts.append(
            f"<|im_start|>user\n{user_turn}<|im_end|>\n"
            f"<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
        )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    input_prompt = "".join(parts)

    output = llm(
        input_prompt,
        temperature=temperature,
        top_p=top_p,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=max_tokens,
        stop=[
            "<|im_end|>",
            "<|endoftext|>",
        ],
        stream=True,
    )
    response = ""
    for chunk in output:
        # Each chunk is a plain dict that is only read here, so the
        # original per-chunk copy.deepcopy was unnecessary overhead.
        response += chunk["choices"][0]["text"]
        yield response

# Gradio chat UI wired to the streaming generator above; each entry in
# additional_inputs becomes an adjustable control passed to generate_text.
demo = gr.ChatInterface(
    generate_text,
    title="Qwen 2.5 (1.5B) - Fast Server",
    description="Running Qwen 2.5 on CPU via llama.cpp",
    examples=[
        ['Hello, introduce yourself.'],
        ['Explain quantum physics simply.'],
        ['Write a python code to sum two numbers.']
    ],
    cache_examples=False,  # avoid running generation for examples at startup
    # NOTE(review): retry_btn/undo_btn/clear_btn were removed in Gradio 5.x —
    # confirm the installed gradio version is < 5, or these kwargs will raise.
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    additional_inputs=[
        gr.Textbox(value="You are a helpful AI assistant.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()