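# test-3 / app.py
# Gradio Space that runs the chat model configured in MODEL_ID (Qwen/Qwen3-0.6B by
# default) on local CPU and streams its output into two panels: the <think> reasoning
# trace and the final Korean answer. No GGUF build or external inference API is used.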
import inspect
import os
import threading
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
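# Runtime configuration; every value below can be overridden with an environment variable.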
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "4096"))
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))
N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))
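# Default system prompt (Korean): "You are a helpful Korean AI assistant. Write every
# natural-language response in Korean only. Think step by step first, then write the
# final answer in clear Korean."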
DEFAULT_SYSTEM_PROMPT = os.getenv(
"SYSTEM_PROMPT",
"๋‹น์‹ ์€ ์œ ์šฉํ•œ ํ•œ๊ตญ์–ด AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค. ๋ชจ๋“  ์ž์—ฐ์–ด ์‘๋‹ต์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”. ๋จผ์ € ์ฐจ๊ทผ์ฐจ๊ทผ ์ƒ๊ฐํ•œ ๋’ค, ์ตœ์ข… ๋‹ต๋ณ€์€ ๋ช…ํ™•ํ•œ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•˜์„ธ์š”.",
)
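# Appended to the system prompt in thinking mode: the model must complete both parts,
# (1) the reasoning section and (2) the final assistant answer after the reasoning,
# never stopping after the reasoning alone, and must write everything in Korean.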
BASE_THINKING_SUFFIX = (
"\n\nthinking ๋ชจ๋“œ๊ฐ€ ์ผœ์ ธ ์žˆ์œผ๋ฉด ๋ฐ˜๋“œ์‹œ ๋‘ ๋ถ€๋ถ„์„ ๋ชจ๋‘ ์™„์„ฑํ•˜์„ธ์š”: "
"(1) reasoning ์˜์—ญ์˜ ์ถ”๋ก  ๋‚ด์šฉ๊ณผ "
"(2) ์ถ”๋ก  ์ข…๋ฃŒ ํ›„ assistant ์˜์—ญ์˜ ์ตœ์ข… ๋‹ต๋ณ€. "
"์ถ”๋ก ๋งŒ ์ถœ๋ ฅํ•˜๊ณ  ๋๋‚ด์ง€ ๋งˆ์„ธ์š”. reasoning๊ณผ assistant์˜ ๋ชจ๋“  ์ž์—ฐ์–ด ๋ฌธ์žฅ์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”."
)
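# Appended to the user message in thinking mode: write the reasoning first, then always
# continue with the assistant's final answer, using natural Korean for both.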
BASE_USER_SUFFIX_THINKING = (
"\n\n๋จผ์ € reasoning์„ ์ž‘์„ฑํ•˜๊ณ , ๊ทธ ๋‹ค์Œ assistant ์ตœ์ข… ๋‹ต๋ณ€์„ ๋ฐ˜๋“œ์‹œ ์ด์–ด์„œ ์ž‘์„ฑํ•˜์„ธ์š”. "
"reasoning๊ณผ ์ตœ์ข… ๋‹ต๋ณ€์˜ ์ž์—ฐ์–ด ๋ฌธ์žฅ์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋งŒ ์‚ฌ์šฉํ•˜์„ธ์š”. ์ตœ์ข… ๋‹ต๋ณ€์€ ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ๋ถ„๋ช…ํ•œ ํ•œ๊ตญ์–ด๋กœ ํฌํ•จํ•˜์„ธ์š”."
)
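# Demo presets (Korean labels): math tutoring, Python coding, structured JSON output,
# function-call-style tool planning, and creative writing.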
PRESETS = {
"์ˆ˜ํ•™": {
"system": (
"๋‹น์‹ ์€ ๊ผผ๊ผผํ•œ ์ˆ˜ํ•™ ํŠœํ„ฐ์ž…๋‹ˆ๋‹ค. ๋ฌธ์ œ๋ฅผ ์ •ํ™•ํžˆ ํ’€์ดํ•˜์„ธ์š”. "
"์ถ”๋ก ์€ reasoning ํŒจ๋„์—, ์ตœ์ข… ๋‹ต๋ณ€์€ assistant ํŒจ๋„์— ํ•œ๊ตญ์–ด๋กœ๋งŒ ๊ฐ„๋‹จ๋ช…๋ฃŒํ•˜๊ฒŒ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"prompt": "๋‹ค์Œ ์ด์ฐจ๋ฐฉ์ •์‹์„ ํ’€์ดํ•˜๊ณ  ๊ณ„์‚ฐ ๊ณผ์ •์„ reasoning์—, ์ตœ์ข… ๊ทผ์„ assistant์— ์ž‘์„ฑํ•˜์„ธ์š”: 2x^2 - 7x + 3 = 0.",
"thinking": True,
},
"์ฝ”๋”ฉ": {
"system": (
"๋‹น์‹ ์€ ํŒŒ์ด์ฌ ๋„์šฐ๋ฏธ์ž…๋‹ˆ๋‹ค. ์ฝ๊ธฐ ์‰ฝ๊ณ  ์ •ํ™•ํ•œ ์ฝ”๋“œ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. "
"๊ณ„ํš์€ reasoning ํŒจ๋„์—, ์ตœ์ข… ์ฝ”๋“œ๋Š” assistant ํŒจ๋„์— ์ž‘์„ฑํ•˜๊ณ  ์„ค๋ช…์€ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"prompt": (
"์ •๋ ฌ๋œ ๋‘ ๋ฆฌ์ŠคํŠธ๋ฅผ ํ•˜๋‚˜์˜ ์ •๋ ฌ๋œ ๋ฆฌ์ŠคํŠธ๋กœ ํ•ฉ์น˜๋Š” "
"merge_sorted_lists(a, b) ํŒŒ์ด์ฌ ํ•จ์ˆ˜๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. reasoning์—๋Š” ์ ‘๊ทผ ๋ฐฉ๋ฒ•์„, assistant์—๋Š” ์ตœ์ข… ์ฝ”๋“œ์™€ ์˜ˆ์‹œ ํ˜ธ์ถœ์„ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"thinking": True,
},
"๊ตฌ์กฐํ™” ์ถœ๋ ฅ": {
"system": "assistant ์ตœ์ข… ๋‹ต๋ณ€์—๋Š” ๊ตฐ๋”๋”๊ธฐ ์—†์ด compact JSON๋งŒ ์ถœ๋ ฅํ•˜์„ธ์š”. JSON ๋ฐ”๊นฅ์˜ ์ž์—ฐ์–ด ์„ค๋ช…์€ ์“ฐ์ง€ ๋งˆ์„ธ์š”.",
"prompt": "๋‹ค์Œ ๋ฉ”๋ชจ์—์„œ ํ•„์š”ํ•œ ์ •๋ณด๋ฅผ ์ถ”์ถœํ•ด JSON์œผ๋กœ๋งŒ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”: ๊ธˆ์š”์ผ๊นŒ์ง€ Mina์—๊ฒŒ ์—ฐ๋ฝ, ์šฐ์„ ์ˆœ์œ„ ๋†’์Œ, ์˜ˆ์‚ฐ ์•ฝ 2400๋‹ฌ๋Ÿฌ, ์ฃผ์ œ๋Š” launch video edits.",
"thinking": False,
},
"ํ•จ์ˆ˜ ํ˜ธ์ถœ ์Šคํƒ€์ผ": {
"system": (
"๋‹น์‹ ์€ ํ•„์š”ํ•  ๋•Œ ๋„๊ตฌ ์‚ฌ์šฉ์„ ๊ณ„ํšํ•˜๋Š” ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค. "
"reasoning ํŒจ๋„์—์„œ ์–ด๋–ค ๋„๊ตฌ๋ฅผ ์“ธ์ง€ ์ •๋ฆฌํ•˜๊ณ , assistant ํŒจ๋„์—์„œ ์ตœ์ข… ๊ฒฐ๊ณผ๋ฅผ ํ•œ๊ตญ์–ด๋กœ๋งŒ ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œํ•˜์„ธ์š”."
),
"prompt": (
"๋„๊ตฌ๋ฅผ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋‹ค๊ณ  ๊ฐ€์ •ํ•˜์„ธ์š”. 18.75 * 42 - 199 ๊ณ„์‚ฐ๊ณผ 12km๋ฅผ ๋งˆ์ผ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ์ž‘์—…์— ๋Œ€ํ•ด "
"reasoning์—๋Š” ๋„๊ตฌ ์‚ฌ์šฉ ๊ณ„ํš์„, assistant์—๋Š” ์ตœ์ข… ์ˆ˜์น˜ ๊ฒฐ๊ณผ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"thinking": True,
},
"์ฐฝ์ž‘": {
"system": "์ƒ์ƒํ•˜๊ณ  ๋ฐ€๋„ ์žˆ๊ฒŒ ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์„ ์ž‘์„ฑํ•˜์„ธ์š”. ์™ธ๊ตญ์–ด ํ‘œํ˜„์„ ์„ž์ง€ ๋งˆ์„ธ์š”.",
"prompt": "ํ‘œ๋ฅ˜ํ•˜๋Š” ๋ฐ•๋ฌผ๊ด€ ์šฐ์ฃผ์„ ์„ ๋ฐฐ๊ฒฝ์œผ๋กœ ํ•œ SF ํ•˜์ด์ŠคํŠธ ์ด์•ผ๊ธฐ์˜ ๋„์ž…๋ถ€๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. reasoning์—๋Š” ๋ถ„์œ„๊ธฐ์™€ ์ „๊ฐœ ๋ฐฉํ–ฅ์„, assistant์—๋Š” ์ตœ์ข… ํ•œ๊ตญ์–ด ๋‘ ๋ฌธ์žฅ์„ ์ž‘์„ฑํ•˜์„ธ์š”.",
"thinking": False,
},
}
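# Keep PyTorch on a bounded number of CPU threads so the Space stays responsive.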
torch.set_num_threads(N_THREADS)
try:
torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
except RuntimeError:
pass
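# Lazily loaded singletons; the locks guard one-time loading and serialize generation.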
_tokenizer = None
_model = None
_load_lock = threading.Lock()
_generate_lock = threading.Lock()
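# Build a Chatbot component, passing type="messages" only on Gradio versions that accept it.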
def make_chatbot(label, height=520):
kwargs = {"label": label, "height": height}
if "type" in inspect.signature(gr.Chatbot.__init__).parameters:
kwargs["type"] = "messages"
return gr.Chatbot(**kwargs)
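# Load the tokenizer and model once, behind a lock, so concurrent requests share them.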
def get_model():
global _tokenizer, _model
if _model is None or _tokenizer is None:
with _load_lock:
if _model is None or _tokenizer is None:
_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
)
_model.eval()
return _tokenizer, _model
def clone_messages(messages):
return [dict(item) for item in (messages or [])]
def load_preset(name):
preset = PRESETS[name]
return (
preset["system"],
preset["prompt"],
preset["thinking"],
)
def clear_all():
return [], [], [], ""
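# Strip end-of-text special tokens from streamed text while leaving <think> markers intact.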
def strip_non_think_specials(text):
text = text or ""
for token in ["<|im_end|>", "<|endoftext|>", "<๏ฝœendโ–ofโ–sentence๏ฝœ>"]:
text = text.replace(token, "")
return text
def final_cleanup(text):
text = strip_non_think_specials(text)
text = text.replace("<think>", "").replace("</think>", "")
return text.strip()
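# Split streamed text into (reasoning, answer, saw_end_think). In thinking mode, everything
# before </think> counts as reasoning; with thinking off, all of it is the answer.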
def split_stream_text(raw_text, thinking):
raw_text = strip_non_think_specials(raw_text)
if not thinking:
return "", final_cleanup(raw_text), False
raw_text = raw_text.replace("<think>", "")
if "</think>" in raw_text:
reasoning, answer = raw_text.split("</think>", 1)
return reasoning.strip(), answer.strip(), True
return raw_text.strip(), "", False
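# Assemble the chat messages: system prompt, truncated history, and the new user turn,
# with the thinking-mode suffixes appended when thinking is enabled.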
def build_messages(system_prompt, message, short_history, thinking):
final_system_prompt = (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT
final_user_message = (message or "").strip()
if thinking:
final_system_prompt += BASE_THINKING_SUFFIX
final_user_message += BASE_USER_SUFFIX_THINKING
return [
{"role": "system", "content": final_system_prompt},
*short_history,
{"role": "user", "content": final_user_message},
]
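# Streaming handler: runs generation on a background thread and yields incremental updates
# for the reasoning panel, the answer panel, the model history state, and the input textbox.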
def respond_stream(
message,
system_prompt,
thinking,
model_history,
reasoning_chat,
answer_chat,
):
message = (message or "").strip()
if not message:
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
return
model_history = list(model_history or [])
reasoning_chat = clone_messages(reasoning_chat)
answer_chat = clone_messages(answer_chat)
reasoning_chat.append({"role": "user", "content": message})
reasoning_chat.append(
{
"role": "assistant",
"content": "(thinking...)" if thinking else "(reasoning disabled)",
}
)
answer_chat.append({"role": "user", "content": message})
answer_chat.append({"role": "assistant", "content": ""})
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
try:
tokenizer, model = get_model()
short_history = model_history[-2 * MAX_HISTORY_TURNS :]
messages = build_messages(system_prompt, message, short_history, thinking)
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=thinking,
)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]
streamer = TextIteratorStreamer(
tokenizer,
skip_prompt=True,
skip_special_tokens=False,
clean_up_tokenization_spaces=False,
timeout=None,
)
generation_kwargs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"max_new_tokens": MAX_NEW_TOKENS,
"do_sample": True,
"temperature": 0.6 if thinking else 0.7,
"top_p": 0.95 if thinking else 0.8,
"top_k": 20,
"repetition_penalty": 1.05,
"pad_token_id": tokenizer.eos_token_id,
"streamer": streamer,
}
generation_error = {}
def run_generation():
try:
with _generate_lock:
model.generate(**generation_kwargs)
except Exception as exc:
generation_error["message"] = str(exc)
streamer.on_finalized_text("", stream_end=True)
thread = threading.Thread(target=run_generation, daemon=True)
thread.start()
raw_text = ""
saw_end_think = False
for chunk in streamer:
raw_text += chunk
reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
saw_end_think = saw_end_think or saw_end_now
if thinking:
if saw_end_think:
reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
else:
reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
else:
reasoning_chat[-1]["content"] = "(reasoning disabled)"
answer_chat[-1]["content"] = answer_text
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
thread.join()
if generation_error:
reasoning_chat[-1]["content"] = ""
answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
return
reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
if thinking and not saw_end_think:
reasoning_text = ""
answer_text = final_cleanup(raw_text)
if thinking:
reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
else:
reasoning_chat[-1]["content"] = "(reasoning disabled)"
answer_chat[-1]["content"] = answer_text or "(empty response)"
model_history = short_history + [
{"role": "user", "content": message},
{"role": "assistant", "content": answer_chat[-1]["content"]},
]
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
except Exception as exc:
reasoning_chat[-1]["content"] = ""
answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
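# Gradio UI: preset dropdown, thinking toggle, prompt textboxes, and the two chat panels.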
with gr.Blocks(title="๋กœ์ปฌ CPU ๋ถ„๋ฆฌํ˜• ์ถ”๋ก  ์ฑ„ํŒ…") as demo:
gr.Markdown(
"# ๋กœ์ปฌ CPU ๋ถ„๋ฆฌํ˜• ์ถ”๋ก  ์ฑ„ํŒ…\n"
f"๋กœ์ปฌ CPU์—์„œ `{MODEL_ID}` ๋ชจ๋ธ์„ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. GGUF๋‚˜ ์™ธ๋ถ€ ์ถ”๋ก  API๋Š” ์‚ฌ์šฉํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.\n\n"
"์ฒซ ์š”์ฒญ์—์„œ๋Š” ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ๊ฐ€ ํ•„์š”ํ•  ์ˆ˜ ์žˆ์–ด ์ดˆ๊ธฐ ์‘๋‹ต์ด ์กฐ๊ธˆ ๋А๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n\n"
"๊ธฐ๋ณธ ์„ค์ •์€ ํ•œ๊ตญ์–ด ๋‹ต๋ณ€ ์šฐ์„ ์ด๋ฉฐ, reasoning ํŒจ๋„๊ณผ ๋‹ต๋ณ€ ํŒจ๋„์„ ๋ถ„๋ฆฌํ•ด์„œ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.\n\n"
"reasoning๊ณผ assistant์— ๋ณ„๋„์˜ ๊ฐœ๋ณ„ ๊ธธ์ด ์ œํ•œ์€ ๋‘์ง€ ์•Š๊ณ , ์ „์ฒด ์ƒ์„ฑ ๊ธธ์ด๋ฅผ ๋„‰๋„‰ํ•˜๊ฒŒ ์„ค์ •ํ–ˆ์Šต๋‹ˆ๋‹ค."
)
with gr.Row():
preset = gr.Dropdown(
choices=list(PRESETS.keys()),
value="์ˆ˜ํ•™",
label="ํ”„๋ฆฌ์…‹ ํ”„๋กฌํ”„ํŠธ",
)
thinking = gr.Checkbox(label="์ถ”๋ก  ์‚ฌ์šฉ", value=True)
system_prompt = gr.Textbox(
label="์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ",
value=PRESETS["์ˆ˜ํ•™"]["system"],
lines=4,
)
user_input = gr.Textbox(
label="์‚ฌ์šฉ์ž ๋ฉ”์‹œ์ง€",
value=PRESETS["์ˆ˜ํ•™"]["prompt"],
lines=5,
)
with gr.Row():
send_btn = gr.Button("์ „์†ก", variant="primary")
clear_btn = gr.Button("์ง€์šฐ๊ธฐ")
with gr.Row():
reasoning_bot = make_chatbot("์ถ”๋ก ", height=520)
answer_bot = make_chatbot("๋‹ต๋ณ€", height=520)
model_history_state = gr.State([])
preset.change(
fn=load_preset,
inputs=preset,
outputs=[system_prompt, user_input, thinking],
)
send_btn.click(
fn=respond_stream,
inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
user_input.submit(
fn=respond_stream,
inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
clear_btn.click(
fn=clear_all,
inputs=None,
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
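# queue() enables the request queue so the generator's incremental yields stream to the browser.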
demo.queue()
demo.launch()