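# test-3 / app.py
# Gradio Space that runs the chat model configured in MODEL_ID (Qwen/Qwen3-0.6B by
# default) on local CPU and streams its output into two panels: the <think> reasoning
# trace and the final Korean answer. No GGUF build or external inference API is used.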
import inspect
import os
import threading
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
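# Runtime configuration; every value below can be overridden with an environment variable.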
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "4096"))
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))
N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))
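# Default system prompt (Korean): "You are a helpful Korean AI assistant. Write every
# natural-language response in Korean only. Think step by step first, then write the
# final answer in clear Korean."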
DEFAULT_SYSTEM_PROMPT = os.getenv(
"SYSTEM_PROMPT",
"๋‹น์‹ ์€ ์œ ์šฉํ•œ ํ•œ๊ตญ์–ด AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค. ๋ชจ๋“  ์ž์—ฐ์–ด ์‘๋‹ต์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”. ๋จผ์ € ์ฐจ๊ทผ์ฐจ๊ทผ ์ƒ๊ฐํ•œ ๋’ค, ์ตœ์ข… ๋‹ต๋ณ€์€ ๋ช…ํ™•ํ•œ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•˜์„ธ์š”.",
)
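# Appended to the system prompt in thinking mode: the model must complete both parts,
# (1) the reasoning section and (2) the final assistant answer after the reasoning,
# never stopping after the reasoning alone, and must write everything in Korean.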
BASE_THINKING_SUFFIX = (
"\n\nthinking ๋ชจ๋“œ๊ฐ€ ์ผœ์ ธ ์žˆ์œผ๋ฉด ๋ฐ˜๋“œ์‹œ ๋‘ ๋ถ€๋ถ„์„ ๋ชจ๋‘ ์™„์„ฑํ•˜์„ธ์š”: "
"(1) reasoning ์˜์—ญ์˜ ์ถ”๋ก  ๋‚ด์šฉ๊ณผ "
"(2) ์ถ”๋ก  ์ข…๋ฃŒ ํ›„ assistant ์˜์—ญ์˜ ์ตœ์ข… ๋‹ต๋ณ€. "
"์ถ”๋ก ๋งŒ ์ถœ๋ ฅํ•˜๊ณ  ๋๋‚ด์ง€ ๋งˆ์„ธ์š”. reasoning๊ณผ assistant์˜ ๋ชจ๋“  ์ž์—ฐ์–ด ๋ฌธ์žฅ์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”."
)
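# Appended to the user message in thinking mode: write the reasoning first, then always
# continue with the assistant's final answer, using natural Korean for both.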
BASE_USER_SUFFIX_THINKING = (
"\n\n๋จผ์ € reasoning์„ ์ž‘์„ฑํ•˜๊ณ , ๊ทธ ๋‹ค์Œ assistant ์ตœ์ข… ๋‹ต๋ณ€์„ ๋ฐ˜๋“œ์‹œ ์ด์–ด์„œ ์ž‘์„ฑํ•˜์„ธ์š”. "
"reasoning๊ณผ ์ตœ์ข… ๋‹ต๋ณ€์˜ ์ž์—ฐ์–ด ๋ฌธ์žฅ์€ ๋ฐ˜๋“œ์‹œ ํ•œ๊ตญ์–ด๋งŒ ์‚ฌ์šฉํ•˜์„ธ์š”. ์ตœ์ข… ๋‹ต๋ณ€์€ ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ๋ถ„๋ช…ํ•œ ํ•œ๊ตญ์–ด๋กœ ํฌํ•จํ•˜์„ธ์š”."
)
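# Demo presets (Korean labels): math tutoring, Python coding, structured JSON output,
# function-call-style tool planning, and creative writing.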
PRESETS = {
"์ˆ˜ํ•™": {
"system": (
"๋‹น์‹ ์€ ๊ผผ๊ผผํ•œ ์ˆ˜ํ•™ ํŠœํ„ฐ์ž…๋‹ˆ๋‹ค. ๋ฌธ์ œ๋ฅผ ์ •ํ™•ํžˆ ํ’€์ดํ•˜์„ธ์š”. "
"์ถ”๋ก ์€ reasoning ํŒจ๋„์—, ์ตœ์ข… ๋‹ต๋ณ€์€ assistant ํŒจ๋„์— ํ•œ๊ตญ์–ด๋กœ๋งŒ ๊ฐ„๋‹จ๋ช…๋ฃŒํ•˜๊ฒŒ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"prompt": "๋‹ค์Œ ์ด์ฐจ๋ฐฉ์ •์‹์„ ํ’€์ดํ•˜๊ณ  ๊ณ„์‚ฐ ๊ณผ์ •์„ reasoning์—, ์ตœ์ข… ๊ทผ์„ assistant์— ์ž‘์„ฑํ•˜์„ธ์š”: 2x^2 - 7x + 3 = 0.",
"thinking": True,
},
"์ฝ”๋”ฉ": {
"system": (
"๋‹น์‹ ์€ ํŒŒ์ด์ฌ ๋„์šฐ๋ฏธ์ž…๋‹ˆ๋‹ค. ์ฝ๊ธฐ ์‰ฝ๊ณ  ์ •ํ™•ํ•œ ์ฝ”๋“œ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. "
"๊ณ„ํš์€ reasoning ํŒจ๋„์—, ์ตœ์ข… ์ฝ”๋“œ๋Š” assistant ํŒจ๋„์— ์ž‘์„ฑํ•˜๊ณ  ์„ค๋ช…์€ ํ•œ๊ตญ์–ด๋กœ๋งŒ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"prompt": (
"์ •๋ ฌ๋œ ๋‘ ๋ฆฌ์ŠคํŠธ๋ฅผ ํ•˜๋‚˜์˜ ์ •๋ ฌ๋œ ๋ฆฌ์ŠคํŠธ๋กœ ํ•ฉ์น˜๋Š” "
"merge_sorted_lists(a, b) ํŒŒ์ด์ฌ ํ•จ์ˆ˜๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. reasoning์—๋Š” ์ ‘๊ทผ ๋ฐฉ๋ฒ•์„, assistant์—๋Š” ์ตœ์ข… ์ฝ”๋“œ์™€ ์˜ˆ์‹œ ํ˜ธ์ถœ์„ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"thinking": True,
},
"๊ตฌ์กฐํ™” ์ถœ๋ ฅ": {
"system": "assistant ์ตœ์ข… ๋‹ต๋ณ€์—๋Š” ๊ตฐ๋”๋”๊ธฐ ์—†์ด compact JSON๋งŒ ์ถœ๋ ฅํ•˜์„ธ์š”. JSON ๋ฐ”๊นฅ์˜ ์ž์—ฐ์–ด ์„ค๋ช…์€ ์“ฐ์ง€ ๋งˆ์„ธ์š”.",
"prompt": "๋‹ค์Œ ๋ฉ”๋ชจ์—์„œ ํ•„์š”ํ•œ ์ •๋ณด๋ฅผ ์ถ”์ถœํ•ด JSON์œผ๋กœ๋งŒ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”: ๊ธˆ์š”์ผ๊นŒ์ง€ Mina์—๊ฒŒ ์—ฐ๋ฝ, ์šฐ์„ ์ˆœ์œ„ ๋†’์Œ, ์˜ˆ์‚ฐ ์•ฝ 2400๋‹ฌ๋Ÿฌ, ์ฃผ์ œ๋Š” launch video edits.",
"thinking": False,
},
"ํ•จ์ˆ˜ ํ˜ธ์ถœ ์Šคํƒ€์ผ": {
"system": (
"๋‹น์‹ ์€ ํ•„์š”ํ•  ๋•Œ ๋„๊ตฌ ์‚ฌ์šฉ์„ ๊ณ„ํšํ•˜๋Š” ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค. "
"reasoning ํŒจ๋„์—์„œ ์–ด๋–ค ๋„๊ตฌ๋ฅผ ์“ธ์ง€ ์ •๋ฆฌํ•˜๊ณ , assistant ํŒจ๋„์—์„œ ์ตœ์ข… ๊ฒฐ๊ณผ๋ฅผ ํ•œ๊ตญ์–ด๋กœ๋งŒ ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œํ•˜์„ธ์š”."
),
"prompt": (
"๋„๊ตฌ๋ฅผ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋‹ค๊ณ  ๊ฐ€์ •ํ•˜์„ธ์š”. 18.75 * 42 - 199 ๊ณ„์‚ฐ๊ณผ 12km๋ฅผ ๋งˆ์ผ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ์ž‘์—…์— ๋Œ€ํ•ด "
"reasoning์—๋Š” ๋„๊ตฌ ์‚ฌ์šฉ ๊ณ„ํš์„, assistant์—๋Š” ์ตœ์ข… ์ˆ˜์น˜ ๊ฒฐ๊ณผ๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”."
),
"thinking": True,
},
"์ฐฝ์ž‘": {
"system": "์ƒ์ƒํ•˜๊ณ  ๋ฐ€๋„ ์žˆ๊ฒŒ ํ•œ๊ตญ์–ด ๋ฌธ์žฅ์„ ์ž‘์„ฑํ•˜์„ธ์š”. ์™ธ๊ตญ์–ด ํ‘œํ˜„์„ ์„ž์ง€ ๋งˆ์„ธ์š”.",
"prompt": "ํ‘œ๋ฅ˜ํ•˜๋Š” ๋ฐ•๋ฌผ๊ด€ ์šฐ์ฃผ์„ ์„ ๋ฐฐ๊ฒฝ์œผ๋กœ ํ•œ SF ํ•˜์ด์ŠคํŠธ ์ด์•ผ๊ธฐ์˜ ๋„์ž…๋ถ€๋ฅผ ์ž‘์„ฑํ•˜์„ธ์š”. reasoning์—๋Š” ๋ถ„์œ„๊ธฐ์™€ ์ „๊ฐœ ๋ฐฉํ–ฅ์„, assistant์—๋Š” ์ตœ์ข… ํ•œ๊ตญ์–ด ๋‘ ๋ฌธ์žฅ์„ ์ž‘์„ฑํ•˜์„ธ์š”.",
"thinking": False,
},
}
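# Keep PyTorch on a bounded number of CPU threads so the Space stays responsive.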
torch.set_num_threads(N_THREADS)
try:
torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
except RuntimeError:
pass
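# Lazily loaded singletons; the locks guard one-time loading and serialize generation.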
_tokenizer = None
_model = None
_load_lock = threading.Lock()
_generate_lock = threading.Lock()
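# Build a Chatbot component, passing type="messages" only on Gradio versions that accept it.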
def make_chatbot(label, height=520):
kwargs = {"label": label, "height": height}
if "type" in inspect.signature(gr.Chatbot.__init__).parameters:
kwargs["type"] = "messages"
return gr.Chatbot(**kwargs)
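# Load the tokenizer and model once, behind a lock, so concurrent requests share them.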
def get_model():
global _tokenizer, _model
if _model is None or _tokenizer is None:
with _load_lock:
if _model is None or _tokenizer is None:
_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
_model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32,
)
_model.eval()
return _tokenizer, _model
def clone_messages(messages):
return [dict(item) for item in (messages or [])]
def load_preset(name):
preset = PRESETS[name]
return (
preset["system"],
preset["prompt"],
preset["thinking"],
)
def clear_all():
return [], [], [], ""
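# Strip end-of-text special tokens from streamed text while leaving <think> markers intact.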
def strip_non_think_specials(text):
text = text or ""
for token in ["<|im_end|>", "<|endoftext|>", "<๏ฝœendโ–ofโ–sentence๏ฝœ>"]:
text = text.replace(token, "")
return text
def final_cleanup(text):
text = strip_non_think_specials(text)
text = text.replace("<think>", "").replace("</think>", "")
return text.strip()
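# Split streamed text into (reasoning, answer, saw_end_think). In thinking mode, everything
# before </think> counts as reasoning; with thinking off, all of it is the answer.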
def split_stream_text(raw_text, thinking):
raw_text = strip_non_think_specials(raw_text)
if not thinking:
return "", final_cleanup(raw_text), False
raw_text = raw_text.replace("<think>", "")
if "</think>" in raw_text:
reasoning, answer = raw_text.split("</think>", 1)
return reasoning.strip(), answer.strip(), True
return raw_text.strip(), "", False
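# Assemble the chat messages: system prompt, truncated history, and the new user turn,
# with the thinking-mode suffixes appended when thinking is enabled.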
def build_messages(system_prompt, message, short_history, thinking):
final_system_prompt = (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT
final_user_message = (message or "").strip()
if thinking:
final_system_prompt += BASE_THINKING_SUFFIX
final_user_message += BASE_USER_SUFFIX_THINKING
return [
{"role": "system", "content": final_system_prompt},
*short_history,
{"role": "user", "content": final_user_message},
]
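# Streaming handler: runs generation on a background thread and yields incremental updates
# for the reasoning panel, the answer panel, the model history state, and the input textbox.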
def respond_stream(
message,
system_prompt,
thinking,
model_history,
reasoning_chat,
answer_chat,
):
message = (message or "").strip()
if not message:
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
return
model_history = list(model_history or [])
reasoning_chat = clone_messages(reasoning_chat)
answer_chat = clone_messages(answer_chat)
reasoning_chat.append({"role": "user", "content": message})
reasoning_chat.append(
{
"role": "assistant",
"content": "(thinking...)" if thinking else "(reasoning disabled)",
}
)
answer_chat.append({"role": "user", "content": message})
answer_chat.append({"role": "assistant", "content": ""})
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
try:
tokenizer, model = get_model()
short_history = model_history[-2 * MAX_HISTORY_TURNS :]
messages = build_messages(system_prompt, message, short_history, thinking)
prompt = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True,
enable_thinking=thinking,
)
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]
streamer = TextIteratorStreamer(
tokenizer,
skip_prompt=True,
skip_special_tokens=False,
clean_up_tokenization_spaces=False,
timeout=None,
)
generation_kwargs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"max_new_tokens": MAX_NEW_TOKENS,
"do_sample": True,
"temperature": 0.6 if thinking else 0.7,
"top_p": 0.95 if thinking else 0.8,
"top_k": 20,
"repetition_penalty": 1.05,
"pad_token_id": tokenizer.eos_token_id,
"streamer": streamer,
}
generation_error = {}
def run_generation():
try:
with _generate_lock:
model.generate(**generation_kwargs)
except Exception as exc:
generation_error["message"] = str(exc)
streamer.on_finalized_text("", stream_end=True)
thread = threading.Thread(target=run_generation, daemon=True)
thread.start()
raw_text = ""
saw_end_think = False
for chunk in streamer:
raw_text += chunk
reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
saw_end_think = saw_end_think or saw_end_now
if thinking:
if saw_end_think:
reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
else:
reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
else:
reasoning_chat[-1]["content"] = "(reasoning disabled)"
answer_chat[-1]["content"] = answer_text
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
thread.join()
if generation_error:
reasoning_chat[-1]["content"] = ""
answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
return
reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
if thinking and not saw_end_think:
reasoning_text = ""
answer_text = final_cleanup(raw_text)
if thinking:
reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
else:
reasoning_chat[-1]["content"] = "(reasoning disabled)"
answer_chat[-1]["content"] = answer_text or "(empty response)"
model_history = short_history + [
{"role": "user", "content": message},
{"role": "assistant", "content": answer_chat[-1]["content"]},
]
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
except Exception as exc:
reasoning_chat[-1]["content"] = ""
answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
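# Gradio UI: preset dropdown, thinking toggle, prompt textboxes, and the two chat panels.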
with gr.Blocks(title="๋กœ์ปฌ CPU ๋ถ„๋ฆฌํ˜• ์ถ”๋ก  ์ฑ„ํŒ…") as demo:
gr.Markdown(
"# ๋กœ์ปฌ CPU ๋ถ„๋ฆฌํ˜• ์ถ”๋ก  ์ฑ„ํŒ…\n"
f"๋กœ์ปฌ CPU์—์„œ `{MODEL_ID}` ๋ชจ๋ธ์„ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค. GGUF๋‚˜ ์™ธ๋ถ€ ์ถ”๋ก  API๋Š” ์‚ฌ์šฉํ•˜์ง€ ์•Š์Šต๋‹ˆ๋‹ค.\n\n"
"์ฒซ ์š”์ฒญ์—์„œ๋Š” ๋ชจ๋ธ ๋‹ค์šด๋กœ๋“œ๊ฐ€ ํ•„์š”ํ•  ์ˆ˜ ์žˆ์–ด ์ดˆ๊ธฐ ์‘๋‹ต์ด ์กฐ๊ธˆ ๋А๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.\n\n"
"๊ธฐ๋ณธ ์„ค์ •์€ ํ•œ๊ตญ์–ด ๋‹ต๋ณ€ ์šฐ์„ ์ด๋ฉฐ, reasoning ํŒจ๋„๊ณผ ๋‹ต๋ณ€ ํŒจ๋„์„ ๋ถ„๋ฆฌํ•ด์„œ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค.\n\n"
"reasoning๊ณผ assistant์— ๋ณ„๋„์˜ ๊ฐœ๋ณ„ ๊ธธ์ด ์ œํ•œ์€ ๋‘์ง€ ์•Š๊ณ , ์ „์ฒด ์ƒ์„ฑ ๊ธธ์ด๋ฅผ ๋„‰๋„‰ํ•˜๊ฒŒ ์„ค์ •ํ–ˆ์Šต๋‹ˆ๋‹ค."
)
with gr.Row():
preset = gr.Dropdown(
choices=list(PRESETS.keys()),
value="์ˆ˜ํ•™",
label="ํ”„๋ฆฌ์…‹ ํ”„๋กฌํ”„ํŠธ",
)
thinking = gr.Checkbox(label="์ถ”๋ก  ์‚ฌ์šฉ", value=True)
system_prompt = gr.Textbox(
label="์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ",
value=PRESETS["์ˆ˜ํ•™"]["system"],
lines=4,
)
user_input = gr.Textbox(
label="์‚ฌ์šฉ์ž ๋ฉ”์‹œ์ง€",
value=PRESETS["์ˆ˜ํ•™"]["prompt"],
lines=5,
)
with gr.Row():
send_btn = gr.Button("์ „์†ก", variant="primary")
clear_btn = gr.Button("์ง€์šฐ๊ธฐ")
with gr.Row():
reasoning_bot = make_chatbot("์ถ”๋ก ", height=520)
answer_bot = make_chatbot("๋‹ต๋ณ€", height=520)
model_history_state = gr.State([])
preset.change(
fn=load_preset,
inputs=preset,
outputs=[system_prompt, user_input, thinking],
)
send_btn.click(
fn=respond_stream,
inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
user_input.submit(
fn=respond_stream,
inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
clear_btn.click(
fn=clear_all,
inputs=None,
outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
)
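# queue() enables the request queue so the generator's incremental yields stream to the browser.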
demo.queue()
demo.launch()