# test3 / app.py
# MuangMuangE's picture
# Create app.py
# e2f81da verified
import gradio as gr
from llama_cpp import Llama
# โœ๏ธ ์•„๋ž˜ ๋‘ ์ค„์„ ๋ณธ์ธ์˜ ๋ชจ๋ธ ์ •๋ณด๋กœ ๋ณ€๊ฒฝํ•˜์„ธ์š”
# ํŒŒ์ธํŠœ๋‹ ๋ชจ๋ธ: "YOUR_USERNAME/qwen3-4b-ft-gguf"
# ๊ณต๊ฐœ ๋ชจ๋ธ ์˜ˆ์‹œ: "unsloth/Qwen3-0.6B-GGUF" (ํ…Œ์ŠคํŠธ์šฉ)
REPO_ID = "unsloth/Qwen3-4B-GGUF" # โœ๏ธ HF Hub ๋ฆฌํฌ์ง€ํ† ๋ฆฌ ID
# REPO_ID = "MuangMuangE/Qwen3-4B-GGUF" # โœ๏ธ HF Hub ๋ฆฌํฌ์ง€ํ† ๋ฆฌ ID
FILENAME = "Qwen3-4B-Q4_K_M.gguf" # โœ๏ธ ํŒŒ์ผ๋ช… (.gguf ํ™•์žฅ์ž ํ•„์ˆ˜)
# โœ๏ธ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ โ€” ์ฑ—๋ด‡์˜ ์—ญํ• ๊ณผ ๋งํˆฌ๋ฅผ ์ •์˜ํ•ฉ๋‹ˆ๋‹ค
SYSTEM_PROMPT = "๋‹น์‹ ์€ ์นœ์ ˆํ•œ ํ•œ๊ตญ์–ด AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค."
# โš ๏ธ ๋ชจ๋ธ์„ ์•ฑ ์‹œ์ž‘ ์‹œ ๋ฐ”๋กœ ๋กœ๋”ฉํ•˜๋ฉด HF Spaces ํ—ฌ์Šค์ฒดํฌ(30๋ถ„) ํƒ€์ž„์•„์›ƒ ๋ฐœ์ƒ
# โ†’ ํ•ด๊ฒฐ: ๋ชจ๋ธ ๋กœ๋”ฉ์„ ์ฒซ ๋ฒˆ์งธ ์‚ฌ์šฉ์ž ๋ฉ”์‹œ์ง€ ์‹œ์ ์œผ๋กœ ์ง€์—ฐ (Lazy Loading)
llm = None
def get_model():
    """Return the shared Llama instance, loading it on the first call only.

    The first call downloads and loads the model; later calls reuse the
    instance cached in the module-level ``llm`` variable.

    - ``from_pretrained``: handles ``hf_hub_download`` plus ``Llama``
      initialisation in a single call.
    - ``n_ctx``: context length, kept small to save memory.
    - ``n_threads``: matched to the 2 vCPUs of the "CPU Basic" tier.
    """
    global llm

    # Fast path: the model has already been loaded by an earlier call.
    if llm is not None:
        return llm

    load_options = {
        # Context length (model maximum: 40960; 2048 to save memory).
        "n_ctx": 2048,
        # CPU Basic = 2 vCPU.
        "n_threads": 2,
        # Hide the detailed log output while loading.
        "verbose": False,
    }
    llm = Llama.from_pretrained(
        repo_id=REPO_ID,
        filename=FILENAME,
        **load_options,
    )
    return llm
def _content_to_text(content):
    """Flatten one chat message's content into plain text ("" if none)."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # List-of-parts content, e.g. [{"type": "text", "text": "..."}].
        # Only textual parts are kept; files/images cannot be fed to the model.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    # None, file tuples, components, ...: nothing usable as prompt text.
    return ""


def _history_to_messages(history):
    """Convert Gradio chat history into ``{"role", "content"}`` dicts.

    Accepts both history layouts Gradio may hand to the callback:
    message dicts (``{"role": ..., "content": ...}``, possibly carrying
    extra UI-only keys) and legacy ``[user, assistant]`` pairs.
    Entries without usable text are dropped.
    """
    messages = []
    for entry in history or []:
        if isinstance(entry, dict):
            turns = [(entry.get("role"), entry.get("content"))]
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            turns = [("user", entry[0]), ("assistant", entry[1])]
        else:
            # Unknown shape: skip rather than forward something the chat
            # template cannot interpret.
            continue
        for role, content in turns:
            text = _content_to_text(content)
            if role and text:
                messages.append({"role": role, "content": text})
    return messages


def respond(message, history):
    """Generate a streamed reply to the user's message.

    Args:
        message: The current user input.
        history: Previous conversation turns (managed automatically by
            Gradio's ChatInterface).

    Yields:
        The reply accumulated so far, once per streamed chunk, so the UI
        can display the text as it is generated.
    """
    # The first call loads the model (may take 1-2 minutes).
    model = get_model()

    # Conversation = system prompt + previous turns + current input.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *_history_to_messages(history),
        {"role": "user", "content": message},
    ]

    # Generate tokens in streaming mode and emit them in real time.
    stream = model.create_chat_completion(
        messages=messages,
        temperature=0.7,  # creativity (0.0 = deterministic, 1.0 = creative)
        max_tokens=512,  # maximum reply length
        stream=True,  # enable streaming
    )
    response = ""
    for chunk in stream:
        # Role-only/final chunks have no "content"; it may also be None.
        delta = chunk["choices"][0]["delta"].get("content") or ""
        response += delta
        yield response  # hand the partial reply to Gradio
# โœ๏ธ Gradio ChatInterface: ์ฑ—๋ด‡ UI๋ฅผ ์ž๋™์œผ๋กœ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค
demo = gr.ChatInterface(
fn=respond,
title="Qwen3 GGUF ์ฑ—๋ด‡", # โœ๏ธ ์ œ๋ชฉ
description="์ฒซ ์‘๋‹ต ์‹œ ๋ชจ๋ธ์„ ๋กœ๋”ฉํ•ฉ๋‹ˆ๋‹ค (1~2๋ถ„ ์†Œ์š”)", # โœ๏ธ ์„ค๋ช…
examples=["์•ˆ๋…•ํ•˜์„ธ์š”!", "ํŒŒ์ด์ฌ์ด๋ž€ ๋ฌด์—‡์ธ๊ฐ€์š”?"], # โœ๏ธ ์˜ˆ์‹œ ์งˆ๋ฌธ
)
# server_name="0.0.0.0": ์™ธ๋ถ€ ์ ‘์† ํ—ˆ์šฉ (Docker ํ•„์ˆ˜)
# server_port=7860: HF Spaces ๊ธฐ๋ณธ ํฌํŠธ
demo.launch(server_name="0.0.0.0", server_port=7860)