puresoulwd committed on
Commit
0e7bd0b
·
verified ·
1 Parent(s): 896dd35

Upload app.py

Files changed (1)
  1. app.py +362 -0
app.py ADDED
@@ -0,0 +1,362 @@
+ import inspect
+ import os
+ import threading
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+
+ MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))
+ MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "4096"))
+ MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))
+ N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))
+ DEFAULT_SYSTEM_PROMPT = os.getenv(
+     "SYSTEM_PROMPT",
+     "You are a helpful Korean-language AI assistant. Write every natural-language response in Korean only. Think step by step first, then write the final answer in clear Korean.",
+ )
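+ # All of the knobs above are environment-driven; a hypothetical launch that
+ # overrides them might look like this (the values are illustrative only):
+ #   MODEL_ID=Qwen/Qwen3-0.6B MAX_NEW_TOKENS=1024 N_THREADS=4 python app.py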
+
+ BASE_THINKING_SUFFIX = (
+     "\n\nWhen thinking mode is on, always complete both parts: "
+     "(1) the reasoning content in the reasoning area and "
+     "(2) the final answer in the assistant area after the reasoning ends. "
+     "Do not stop after emitting only the reasoning. Write every natural-language sentence in both reasoning and assistant in Korean only."
+ )
+
+ BASE_USER_SUFFIX_THINKING = (
+     "\n\nWrite the reasoning first, then always continue with the assistant's final answer. "
+     "Use Korean only for the natural-language sentences in the reasoning and the final answer, and include the final answer in natural, clear Korean."
+ )
+
+ PRESETS = {
+     "Math": {
+         "system": (
+             "You are a meticulous math tutor. Solve problems accurately. "
+             "Write the reasoning in the reasoning panel and the final answer in the assistant panel, concisely and in Korean only."
+         ),
+         "prompt": "Solve the following quadratic equation, putting the work in reasoning and the final roots in assistant: 2x^2 - 7x + 3 = 0.",
+         "thinking": True,
+     },
+     "Coding": {
+         "system": (
+             "You are a Python helper. Write readable, correct code. "
+             "Put the plan in the reasoning panel and the final code in the assistant panel, with explanations in Korean only."
+         ),
+         "prompt": (
+             "Write a Python function merge_sorted_lists(a, b) that merges two sorted lists "
+             "into one sorted list. Put the approach in reasoning and the final code with an example call in assistant."
+         ),
+         "thinking": True,
+     },
+     "Structured output": {
+         "system": "In the assistant final answer, output compact JSON only, with no filler. Do not write any natural language outside the JSON.",
+         "prompt": "Extract the needed fields from the following memo and return JSON only: contact Mina by Friday, priority high, budget about 2400 dollars, topic is launch video edits.",
+         "thinking": False,
+     },
+     "Function-calling style": {
+         "system": (
+             "You are an assistant that plans tool use when needed. "
+             "Lay out which tools to use in the reasoning panel, then present the final result clearly, in Korean only, in the assistant panel."
+         ),
+         "prompt": (
+             "Assume tools are available. For computing 18.75 * 42 - 199 and converting 12 km to miles, "
+             "put the tool-use plan in reasoning and the final numeric results in assistant."
+         ),
+         "thinking": True,
+     },
+     "Creative writing": {
+         "system": "Write vivid, dense Korean prose. Do not mix in foreign-language expressions.",
+         "prompt": "Write the opening of a sci-fi heist story set on a drifting museum spaceship. Put the mood and narrative direction in reasoning and the final two Korean sentences in assistant.",
+         "thinking": False,
+     },
+ }
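+ # Each preset bundles a system prompt, a starter user prompt, and a thinking
+ # flag, matching what load_preset() returns below. A new entry only needs the
+ # same three keys, e.g. (hypothetical, not part of the shipped app):
+ #   PRESETS["Summarize"] = {"system": "...", "prompt": "...", "thinking": False}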
+
+
+ torch.set_num_threads(N_THREADS)
+ try:
+     torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
+ except RuntimeError:
+     pass
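+ # torch.set_num_interop_threads() raises RuntimeError if parallel work has
+ # already started in the process (e.g. on a hot reload), so that failure is
+ # deliberately swallowed; the intra-op thread count above is the setting that
+ # matters most for CPU generation throughput.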
+
+ _tokenizer = None
+ _model = None
+ _load_lock = threading.Lock()
+ _generate_lock = threading.Lock()
+
+
+ def make_chatbot(label, height=520):
+     kwargs = {"label": label, "height": height}
+     if "type" in inspect.signature(gr.Chatbot.__init__).parameters:
+         kwargs["type"] = "messages"
+     return gr.Chatbot(**kwargs)
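+ # The signature probe keeps the app working across Gradio versions: newer
+ # releases accept gr.Chatbot(type="messages") and expect role/content dicts,
+ # while older ones predate the argument, so it is passed only when supported.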
+
+
+ def get_model():
+     global _tokenizer, _model
+     if _model is None or _tokenizer is None:
+         with _load_lock:
+             if _model is None or _tokenizer is None:
+                 _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+                 _model = AutoModelForCausalLM.from_pretrained(
+                     MODEL_ID,
+                     torch_dtype=torch.float32,
+                 )
+                 _model.eval()
+     return _tokenizer, _model
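+ # Double-checked locking: the unguarded check skips the lock on the hot path,
+ # and the second check under _load_lock ensures concurrent first requests load
+ # the weights only once. float32 is the conservative dtype for CPU inference.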
+
+
+ def clone_messages(messages):
+     return [dict(item) for item in (messages or [])]
+
+
+ def load_preset(name):
+     preset = PRESETS[name]
+     return (
+         preset["system"],
+         preset["prompt"],
+         preset["thinking"],
+     )
+
+
+ def clear_all():
+     return [], [], [], ""
+
+
+ def strip_non_think_specials(text):
+     text = text or ""
+     for token in ["<|im_end|>", "<|endoftext|>", "<｜end▁of▁sentence｜>"]:
+         text = text.replace(token, "")
+     return text
+
+
+ def final_cleanup(text):
+     text = strip_non_think_specials(text)
+     text = text.replace("<think>", "").replace("</think>", "")
+     return text.strip()
+
+
+ def split_stream_text(raw_text, thinking):
+     raw_text = strip_non_think_specials(raw_text)
+     if not thinking:
+         return "", final_cleanup(raw_text), False
+
+     raw_text = raw_text.replace("<think>", "")
+     if "</think>" in raw_text:
+         reasoning, answer = raw_text.split("</think>", 1)
+         return reasoning.strip(), answer.strip(), True
+
+     return raw_text.strip(), "", False
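+ # The partial stream is re-split on every chunk. For example, the raw text
+ # "<think>check the roots</think>x = 3 or x = 1/2" yields
+ # ("check the roots", "x = 3 or x = 1/2", True), while a stream that has not
+ # reached </think> yet lands entirely in the reasoning slot with False.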
+
+
+ def build_messages(system_prompt, message, short_history, thinking):
+     final_system_prompt = (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT
+     final_user_message = (message or "").strip()
+
+     if thinking:
+         final_system_prompt += BASE_THINKING_SUFFIX
+         final_user_message += BASE_USER_SUFFIX_THINKING
+
+     return [
+         {"role": "system", "content": final_system_prompt},
+         *short_history,
+         {"role": "user", "content": final_user_message},
+     ]
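+ # The result is a standard chat-template message list: one system message,
+ # the trimmed prior turns, then the new user message, e.g.
+ #   [{"role": "system", "content": "..."},
+ #    {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."},
+ #    {"role": "user", "content": "..."}]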
+
+
+ def respond_stream(
+     message,
+     system_prompt,
+     thinking,
+     model_history,
+     reasoning_chat,
+     answer_chat,
+ ):
+     message = (message or "").strip()
+     if not message:
+         yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
+         return
+
+     model_history = list(model_history or [])
+     reasoning_chat = clone_messages(reasoning_chat)
+     answer_chat = clone_messages(answer_chat)
+
+     reasoning_chat.append({"role": "user", "content": message})
+     reasoning_chat.append(
+         {
+             "role": "assistant",
+             "content": "(thinking...)" if thinking else "(reasoning disabled)",
+         }
+     )
+     answer_chat.append({"role": "user", "content": message})
+     answer_chat.append({"role": "assistant", "content": ""})
+
+     yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
+
+     try:
+         tokenizer, model = get_model()
+         short_history = model_history[-2 * MAX_HISTORY_TURNS :]
+         messages = build_messages(system_prompt, message, short_history, thinking)
+
+         prompt = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True,
+             enable_thinking=thinking,
+         )
+         inputs = tokenizer(prompt, return_tensors="pt")
+         input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
+         attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]
+
+         streamer = TextIteratorStreamer(
+             tokenizer,
+             skip_prompt=True,
+             skip_special_tokens=False,
+             clean_up_tokenization_spaces=False,
+             timeout=None,
+         )
+
+         generation_kwargs = {
+             "input_ids": input_ids,
+             "attention_mask": attention_mask,
+             "max_new_tokens": MAX_NEW_TOKENS,
+             "do_sample": True,
+             "temperature": 0.6 if thinking else 0.7,
+             "top_p": 0.95 if thinking else 0.8,
+             "top_k": 20,
+             "repetition_penalty": 1.05,
+             "pad_token_id": tokenizer.eos_token_id,
+             "streamer": streamer,
+         }
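+         # These sampling settings appear to follow the Qwen3 model card's
+         # recommended values (temperature 0.6 / top_p 0.95 in thinking mode,
+         # 0.7 / 0.8 otherwise, top_k 20); they are not tuned for other models.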
+
+         generation_error = {}
+
+         def run_generation():
+             try:
+                 with _generate_lock:
+                     model.generate(**generation_kwargs)
+             except Exception as exc:
+                 generation_error["message"] = str(exc)
+                 streamer.on_finalized_text("", stream_end=True)
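+         # If generate() dies, on_finalized_text("", stream_end=True) pushes the
+         # streamer's stop signal onto its queue so that the consuming for-loop
+         # below terminates instead of blocking forever (timeout=None).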
243
+
244
+ thread = threading.Thread(target=run_generation, daemon=True)
245
+ thread.start()
246
+
247
+ raw_text = ""
248
+ saw_end_think = False
249
+
250
+ for chunk in streamer:
251
+ raw_text += chunk
252
+ reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
253
+ saw_end_think = saw_end_think or saw_end_now
254
+
255
+ if thinking:
256
+ if saw_end_think:
257
+ reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
258
+ else:
259
+ reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
260
+ else:
261
+ reasoning_chat[-1]["content"] = "(reasoning disabled)"
262
+
263
+ answer_chat[-1]["content"] = answer_text
264
+ yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
265
+
266
+ thread.join()
267
+
268
+ if generation_error:
269
+ reasoning_chat[-1]["content"] = ""
270
+ answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
271
+ yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
272
+ return
273
+
274
+ reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
275
+ if thinking and not saw_end_think:
276
+ reasoning_text = ""
277
+ answer_text = final_cleanup(raw_text)
278
+
279
+ if thinking:
280
+ reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
281
+ else:
282
+ reasoning_chat[-1]["content"] = "(reasoning disabled)"
283
+
284
+ answer_chat[-1]["content"] = answer_text or "(empty response)"
285
+ model_history = short_history + [
286
+ {"role": "user", "content": message},
287
+ {"role": "assistant", "content": answer_chat[-1]["content"]},
288
+ ]
289
+
290
+ yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
291
+
292
+ except Exception as exc:
293
+ reasoning_chat[-1]["content"] = ""
294
+ answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
295
+ yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
296
+
+
+ with gr.Blocks(title="Local CPU split-reasoning chat") as demo:
+     gr.Markdown(
+         "# Local CPU split-reasoning chat\n"
+         f"Runs the `{MODEL_ID}` model on the local CPU. No GGUF build or external inference API is used.\n\n"
+         "The first request may need to download the model, so the initial response can be slow.\n\n"
+         "The defaults favor Korean answers and show the reasoning panel and the answer panel separately.\n\n"
+         "There are no separate length caps for reasoning and assistant; the total generation budget is simply set generously."
+     )
+
+     with gr.Row():
+         preset = gr.Dropdown(
+             choices=list(PRESETS.keys()),
+             value="Math",
+             label="Preset prompt",
+         )
+         thinking = gr.Checkbox(label="Use reasoning", value=True)
+
+     system_prompt = gr.Textbox(
+         label="System prompt",
+         value=PRESETS["Math"]["system"],
+         lines=4,
+     )
+
+     user_input = gr.Textbox(
+         label="User message",
+         value=PRESETS["Math"]["prompt"],
+         lines=5,
+     )
+
+     with gr.Row():
+         send_btn = gr.Button("Send", variant="primary")
+         clear_btn = gr.Button("Clear")
+
+     with gr.Row():
+         reasoning_bot = make_chatbot("Reasoning", height=520)
+         answer_bot = make_chatbot("Answer", height=520)
+
+     model_history_state = gr.State([])
+
+     preset.change(
+         fn=load_preset,
+         inputs=preset,
+         outputs=[system_prompt, user_input, thinking],
+     )
+
+     send_btn.click(
+         fn=respond_stream,
+         inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
+         outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
+     )
+     user_input.submit(
+         fn=respond_stream,
+         inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
+         outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
+     )
+
+     clear_btn.click(
+         fn=clear_all,
+         inputs=None,
+         outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
+     )
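+     # The Send button and pressing Enter in the textbox share one streaming
+     # handler; its fourth output ("") clears the input box after each send.
+     # Together with _generate_lock, demo.queue() below keeps concurrent
+     # requests from running parallel generate() calls on the single CPU model.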
+
+
+ demo.queue()
+ demo.launch()