File size: 3,280 Bytes
e2f81da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import gradio as gr
from llama_cpp import Llama

# EDIT HERE: change the two values below to point at your own model.
#   Fine-tuned model example:           "YOUR_USERNAME/qwen3-4b-ft-gguf"
#   Public model example (for testing): "unsloth/Qwen3-0.6B-GGUF"
REPO_ID = "unsloth/Qwen3-4B-GGUF"   # Hugging Face Hub repository ID
# Alternative repository that was used previously: "MuangMuangE/Qwen3-4B-GGUF"
FILENAME = "Qwen3-4B-Q4_K_M.gguf"   # model file name (the .gguf extension is required)

# EDIT HERE: system prompt that defines the chatbot's role and tone.
# The text is Korean for "You are a friendly Korean-language AI assistant."
# It must be stored as real UTF-8 Korean: a mis-decoded (mojibake) copy would be
# sent to the model verbatim and carry no usable instruction.
SYSTEM_PROMPT = "당신은 친절한 한국어 AI 어시스턴트입니다."

# WARNING: per the original author's note, loading the model at app start-up
# runs into the HF Spaces health-check timeout (30 minutes).
# Work-around: defer loading until the first user message (lazy loading).
# `llm` stays None until get_model() populates it.
llm = None

def get_model():
    """Return the shared ``Llama`` instance, creating it on the first call.

    The first call builds the model with ``Llama.from_pretrained`` from
    ``REPO_ID`` / ``FILENAME`` and stores it in the module-level ``llm``.
    Every later call returns that stored object without loading again.

    Load settings:
    - n_ctx: context length, kept small to leave memory headroom.
    - n_threads: 2, chosen to match the 2 vCPUs of the "CPU Basic" tier.
    - verbose: False, to hide the detailed loading log.
    """
    global llm

    if llm is not None:
        # Already loaded by an earlier call; reuse it.
        return llm

    load_options = {
        "repo_id": REPO_ID,
        "filename": FILENAME,
        # Original note: the model's maximum is 40960; 2048 is used to save memory.
        "n_ctx": 2048,
        "n_threads": 2,
        "verbose": False,
    }
    llm = Llama.from_pretrained(**load_options)
    return llm

def _content_to_text(content):
    """Return the plain text carried by a chat message's content, or None if there is none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Structured content: a list of parts. Presumably text parts look like
        # {"type": "text", "text": "..."} -- verify against the installed Gradio version.
        # Only parts exposing a string "text" field are kept.
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        return "".join(texts) if texts else None
    # None, file tuples, components, ...: nothing a text-only model can consume.
    return None


def _history_to_messages(history):
    """Convert Gradio chat history into plain ``{"role", "content"}`` dicts."""
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            # "messages" format: keep only role/content and drop any extra keys.
            role = entry.get("role")
            text = _content_to_text(entry.get("content"))
            if role and text is not None:
                messages.append({"role": role, "content": text})
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            # Legacy pair format: [user_message, assistant_message]; either side may be empty.
            for role, content in zip(("user", "assistant"), entry):
                text = _content_to_text(content)
                if text is not None:
                    messages.append({"role": role, "content": text})
    return messages


def respond(message, history):
    """Stream the assistant's reply to *message*.

    Args:
        message: The current user input.
        history: Earlier turns of the conversation as supplied by Gradio's
            ChatInterface. Both role/content dicts and legacy
            ``[user, assistant]`` pairs are accepted; entries without text
            are skipped.

    Yields:
        The reply accumulated so far, once per streamed chunk, so the UI can
        update in real time.
    """
    # The first call triggers model loading (may take 1-2 minutes per the original note).
    model = get_model()

    # Conversation = system prompt + previous turns + current input.
    # NOTE(review): the full history is sent every time; a long conversation may
    # exceed the context length configured in get_model() -- consider trimming.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *_history_to_messages(history),
        {"role": "user", "content": message},
    ]

    stream = model.create_chat_completion(
        messages=messages,
        temperature=0.7,   # sampling temperature (0.0 = deterministic, 1.0 = more creative)
        max_tokens=512,    # maximum reply length in tokens
        stream=True,       # receive the reply chunk by chunk
    )

    response = ""
    for chunk in stream:
        # A delta may omit "content" (e.g. a role-only chunk) or carry None;
        # `or ""` covers both so the concatenation below cannot raise TypeError.
        piece = chunk["choices"][0]["delta"].get("content") or ""
        response += piece
        yield response     # hand the text so far to Gradio

# Gradio ChatInterface: builds the chatbot UI around `respond`.
# The user-facing strings are Korean and must be stored as real UTF-8 text
# (a mis-decoded copy would be shown to users as unreadable characters):
#   title       -> "Qwen3 GGUF chatbot"
#   description -> "The model is loaded on the first reply (takes 1-2 minutes)"
#   examples    -> "Hello!", "What is Python?"
demo = gr.ChatInterface(
    fn=respond,
    title="Qwen3 GGUF 챗봇",
    description="첫 응답 시 모델을 로딩합니다 (1~2분 소요)",
    examples=["안녕하세요!", "파이썬이란 무엇인가요?"],
)

# server_name="0.0.0.0": accept connections from outside (required under Docker).
# server_port=7860: the default HF Spaces port.
demo.launch(server_name="0.0.0.0", server_port=7860)