# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were a Hugging Face
# Spaces status banner captured by the scraper, not part of the program.
# Standard library.
import inspect
import os
import threading

# Third-party: UI framework and model runtime.
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Silence the HF tokenizers fork/parallelism warning when worker threads run.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Runtime configuration, each overridable via an environment variable.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-0.6B")
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "4096"))      # generation budget
MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "4096"))  # prompt truncation (keeps tail)
MAX_HISTORY_TURNS = int(os.getenv("MAX_HISTORY_TURNS", "3"))   # user/assistant pairs kept
N_THREADS = int(os.getenv("N_THREADS", str(max(1, os.cpu_count() or 1))))

# NOTE(review): the Korean literals below appear to be mojibake (UTF-8 bytes
# decoded with the wrong codec). They are runtime strings, so they are kept
# byte-identical here — confirm and re-encode at the source if possible.
DEFAULT_SYSTEM_PROMPT = os.getenv(
    "SYSTEM_PROMPT",
    "๋น์ ์ ์ ์ฉํ ํ๊ตญ์ด AI ์ด์์คํดํธ์ ๋๋ค. ๋ชจ๋ ์์ฐ์ด ์๋ต์ ๋ฐ๋์ ํ๊ตญ์ด๋ก๋ง ์์ฑํ์ธ์. ๋จผ์ ์ฐจ๊ทผ์ฐจ๊ทผ ์๊ฐํ ๋ค, ์ต์ข ๋ต๋ณ์ ๋ช ํํ ํ๊ตญ์ด๋ก ์์ฑํ์ธ์.",
)
# Suffix appended to the system prompt when thinking mode is on.
BASE_THINKING_SUFFIX = (
    "\n\nthinking ๋ชจ๋๊ฐ ์ผ์ ธ ์์ผ๋ฉด ๋ฐ๋์ ๋ ๋ถ๋ถ์ ๋ชจ๋ ์์ฑํ์ธ์: "
    "(1) reasoning ์์ญ์ ์ถ๋ก ๋ด์ฉ๊ณผ "
    "(2) ์ถ๋ก ์ข ๋ฃ ํ assistant ์์ญ์ ์ต์ข ๋ต๋ณ. "
    "์ถ๋ก ๋ง ์ถ๋ ฅํ๊ณ ๋๋ด์ง ๋ง์ธ์. reasoning๊ณผ assistant์ ๋ชจ๋ ์์ฐ์ด ๋ฌธ์ฅ์ ๋ฐ๋์ ํ๊ตญ์ด๋ก๋ง ์์ฑํ์ธ์."
)
# Suffix appended to the user message when thinking mode is on.
BASE_USER_SUFFIX_THINKING = (
    "\n\n๋จผ์ reasoning์ ์์ฑํ๊ณ , ๊ทธ ๋ค์ assistant ์ต์ข ๋ต๋ณ์ ๋ฐ๋์ ์ด์ด์ ์์ฑํ์ธ์. "
    "reasoning๊ณผ ์ต์ข ๋ต๋ณ์ ์์ฐ์ด ๋ฌธ์ฅ์ ๋ฐ๋์ ํ๊ตญ์ด๋ง ์ฌ์ฉํ์ธ์. ์ต์ข ๋ต๋ณ์ ์์ฐ์ค๋ฝ๊ณ ๋ถ๋ช ํ ํ๊ตญ์ด๋ก ํฌํจํ์ธ์."
)
# Prompt presets selectable from the UI dropdown. Each entry supplies a system
# prompt, an example user prompt, and whether thinking mode starts enabled.
# NOTE(review): Korean literals kept byte-identical (apparent mojibake — see
# note on DEFAULT_SYSTEM_PROMPT above the original definitions).
PRESETS = {
    "์ํ": {  # math: quadratic-equation walkthrough, thinking on
        "system": (
            "๋น์ ์ ๊ผผ๊ผผํ ์ํ ํํฐ์ ๋๋ค. ๋ฌธ์ ๋ฅผ ์ ํํ ํ์ดํ์ธ์. "
            "์ถ๋ก ์ reasoning ํจ๋์, ์ต์ข ๋ต๋ณ์ assistant ํจ๋์ ํ๊ตญ์ด๋ก๋ง ๊ฐ๋จ๋ช ๋ฃํ๊ฒ ์์ฑํ์ธ์."
        ),
        "prompt": "๋ค์ ์ด์ฐจ๋ฐฉ์ ์์ ํ์ดํ๊ณ ๊ณ์ฐ ๊ณผ์ ์ reasoning์, ์ต์ข ๊ทผ์ assistant์ ์์ฑํ์ธ์: 2x^2 - 7x + 3 = 0.",
        "thinking": True,
    },
    "์ฝ๋ฉ": {  # coding: merge two sorted lists, thinking on
        "system": (
            "๋น์ ์ ํ์ด์ฌ ๋์ฐ๋ฏธ์ ๋๋ค. ์ฝ๊ธฐ ์ฝ๊ณ ์ ํํ ์ฝ๋๋ฅผ ์์ฑํ์ธ์. "
            "๊ณํ์ reasoning ํจ๋์, ์ต์ข ์ฝ๋๋ assistant ํจ๋์ ์์ฑํ๊ณ ์ค๋ช ์ ํ๊ตญ์ด๋ก๋ง ์์ฑํ์ธ์."
        ),
        "prompt": (
            "์ ๋ ฌ๋ ๋ ๋ฆฌ์คํธ๋ฅผ ํ๋์ ์ ๋ ฌ๋ ๋ฆฌ์คํธ๋ก ํฉ์น๋ "
            "merge_sorted_lists(a, b) ํ์ด์ฌ ํจ์๋ฅผ ์์ฑํ์ธ์. reasoning์๋ ์ ๊ทผ ๋ฐฉ๋ฒ์, assistant์๋ ์ต์ข ์ฝ๋์ ์์ ํธ์ถ์ ์์ฑํ์ธ์."
        ),
        "thinking": True,
    },
    "๊ตฌ์กฐํ ์ถ๋ ฅ": {  # structured output: JSON-only extraction, thinking off
        "system": "assistant ์ต์ข ๋ต๋ณ์๋ ๊ตฐ๋๋๊ธฐ ์์ด compact JSON๋ง ์ถ๋ ฅํ์ธ์. JSON ๋ฐ๊นฅ์ ์์ฐ์ด ์ค๋ช ์ ์ฐ์ง ๋ง์ธ์.",
        "prompt": "๋ค์ ๋ฉ๋ชจ์์ ํ์ํ ์ ๋ณด๋ฅผ ์ถ์ถํด JSON์ผ๋ก๋ง ๋ฐํํ์ธ์: ๊ธ์์ผ๊น์ง Mina์๊ฒ ์ฐ๋ฝ, ์ฐ์ ์์ ๋์, ์์ฐ ์ฝ 2400๋ฌ๋ฌ, ์ฃผ์ ๋ launch video edits.",
        "thinking": False,
    },
    "ํจ์ ํธ์ถ ์คํ์ผ": {  # tool-use planning style, thinking on
        "system": (
            "๋น์ ์ ํ์ํ ๋ ๋๊ตฌ ์ฌ์ฉ์ ๊ณํํ๋ ์ด์์คํดํธ์ ๋๋ค. "
            "reasoning ํจ๋์์ ์ด๋ค ๋๊ตฌ๋ฅผ ์ธ์ง ์ ๋ฆฌํ๊ณ , assistant ํจ๋์์ ์ต์ข ๊ฒฐ๊ณผ๋ฅผ ํ๊ตญ์ด๋ก๋ง ๋ช ํํ๊ฒ ์ ์ํ์ธ์."
        ),
        "prompt": (
            "๋๊ตฌ๋ฅผ ์ฌ์ฉํ ์ ์๋ค๊ณ ๊ฐ์ ํ์ธ์. 18.75 * 42 - 199 ๊ณ์ฐ๊ณผ 12km๋ฅผ ๋ง์ผ๋ก ๋ณํํ๋ ์์ ์ ๋ํด "
            "reasoning์๋ ๋๊ตฌ ์ฌ์ฉ ๊ณํ์, assistant์๋ ์ต์ข ์์น ๊ฒฐ๊ณผ๋ฅผ ์์ฑํ์ธ์."
        ),
        "thinking": True,
    },
    "์ฐฝ์": {  # creative writing: SF flash fiction opener, thinking off
        "system": "์์ํ๊ณ ๋ฐ๋ ์๊ฒ ํ๊ตญ์ด ๋ฌธ์ฅ์ ์์ฑํ์ธ์. ์ธ๊ตญ์ด ํํ์ ์์ง ๋ง์ธ์.",
        "prompt": "ํ๋ฅํ๋ ๋ฐ๋ฌผ๊ด ์ฐ์ฃผ์ ์ ๋ฐฐ๊ฒฝ์ผ๋ก ํ SF ํ์ด์คํธ ์ด์ผ๊ธฐ์ ๋์ ๋ถ๋ฅผ ์์ฑํ์ธ์. reasoning์๋ ๋ถ์๊ธฐ์ ์ ๊ฐ ๋ฐฉํฅ์, assistant์๋ ์ต์ข ํ๊ตญ์ด ๋ ๋ฌธ์ฅ์ ์์ฑํ์ธ์.",
        "thinking": False,
    },
}
# Pin PyTorch's CPU intra-op thread pool so the app does not oversubscribe cores.
torch.set_num_threads(N_THREADS)
try:
    # Inter-op threads can only be set once, before any parallel work begins;
    # a second call raises RuntimeError, which is safe to ignore here.
    torch.set_num_interop_threads(max(1, min(2, N_THREADS)))
except RuntimeError:
    pass

# Lazily-loaded singletons. Gradio can serve requests from multiple threads:
# _load_lock serializes the one-time model load (see get_model), and
# _generate_lock serializes generate() calls on the shared model instance.
_tokenizer = None
_model = None
_load_lock = threading.Lock()
_generate_lock = threading.Lock()
def make_chatbot(label, height=520):
    """Build a gr.Chatbot, opting into the "messages" format when available.

    Older Gradio releases lack the ``type`` parameter, so it is passed only
    when ``gr.Chatbot.__init__`` actually declares it.
    """
    options = {"label": label, "height": height}
    supported = inspect.signature(gr.Chatbot.__init__).parameters
    if "type" in supported:
        options["type"] = "messages"
    return gr.Chatbot(**options)
def get_model():
    """Return the shared (tokenizer, model) pair, loading them on first use.

    Uses double-checked locking: the unlocked check is a fast path for the
    common already-loaded case, and the re-check under _load_lock guarantees
    only one thread performs the (slow) download and instantiation.
    """
    global _tokenizer, _model
    if _model is None or _tokenizer is None:
        with _load_lock:
            if _model is None or _tokenizer is None:
                _tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
                _model = AutoModelForCausalLM.from_pretrained(
                    MODEL_ID,
                    # CPU-only inference: keep full float32 precision.
                    torch_dtype=torch.float32,
                )
                # Inference mode: disables dropout and similar training behavior.
                _model.eval()
    return _tokenizer, _model
def clone_messages(messages):
    """Return a list of shallow per-message copies (None becomes an empty list).

    Copying each dict keeps Gradio state snapshots independent of later
    in-place mutation of the live chat lists.
    """
    source = messages or []
    copies = []
    for entry in source:
        copies.append(dict(entry))
    return copies
def load_preset(name):
    """Resolve a preset name to its (system, prompt, thinking) UI values.

    Raises KeyError for unknown names; the dropdown only offers valid keys.
    """
    entry = PRESETS[name]
    return entry["system"], entry["prompt"], entry["thinking"]
def clear_all():
    """Reset both chat panes, the stored model history, and the input box."""
    fresh_reasoning = []
    fresh_answers = []
    fresh_history = []
    return fresh_reasoning, fresh_answers, fresh_history, ""
def strip_non_think_specials(text):
    """Remove chat-model end-of-turn special tokens from *text* (None -> "").

    The ``</think>`` marker is deliberately left intact so that downstream
    splitting can still separate reasoning from the final answer.
    """
    text = text or ""
    # BUGFIX: the third token was mojibake ("<๏ฝendโofโsentence๏ฝ>") and could
    # never match real output. Restored DeepSeek's EOS marker, which uses
    # fullwidth bars (U+FF5C) and LOWER ONE EIGHTH BLOCK (U+2581) separators.
    for token in ("<|im_end|>", "<|endoftext|>", "<｜end▁of▁sentence｜>"):
        text = text.replace(token, "")
    return text
def final_cleanup(text):
    """Strip special tokens and think-tags, returning trimmed display text."""
    cleaned = strip_non_think_specials(text)
    for tag in ("<think>", "</think>"):
        cleaned = cleaned.replace(tag, "")
    return cleaned.strip()
def split_stream_text(raw_text, thinking):
    """Split streamed model text into (reasoning, answer, saw_end_of_think).

    With thinking disabled, everything is the answer. With thinking enabled,
    text before ``</think>`` is reasoning and text after it is the answer;
    until that marker arrives, all text counts as in-progress reasoning.
    """
    raw_text = strip_non_think_specials(raw_text)
    if not thinking:
        return "", final_cleanup(raw_text), False
    raw_text = raw_text.replace("<think>", "")
    marker = "</think>"
    if marker not in raw_text:
        # Still inside the reasoning phase: no final answer yet.
        return raw_text.strip(), "", False
    reasoning, _, answer = raw_text.partition(marker)
    return reasoning.strip(), answer.strip(), True
def build_messages(system_prompt, message, short_history, thinking):
    """Assemble the chat-template message list: system, history, user turn.

    Blank system prompts fall back to DEFAULT_SYSTEM_PROMPT; when thinking is
    enabled, the thinking-mode suffixes are appended to both system and user
    text to steer the model's reasoning/answer split.
    """
    sys_text = (system_prompt or "").strip() or DEFAULT_SYSTEM_PROMPT
    user_text = (message or "").strip()
    if thinking:
        sys_text = sys_text + BASE_THINKING_SUFFIX
        user_text = user_text + BASE_USER_SUFFIX_THINKING
    conversation = [{"role": "system", "content": sys_text}]
    conversation.extend(short_history)
    conversation.append({"role": "user", "content": user_text})
    return conversation
def respond_stream(
    message,
    system_prompt,
    thinking,
    model_history,
    reasoning_chat,
    answer_chat,
):
    """Generator backing the send/submit events.

    Streams (reasoning_chat, answer_chat, model_history, textbox_value)
    tuples to Gradio: first an immediate echo of the user turn with
    placeholders, then one update per streamed chunk, then a final tuple
    with the cleaned answer and the updated rolling history. The textbox
    value is always "" so the input is cleared.
    """
    message = (message or "").strip()
    if not message:
        # Nothing to do: re-emit current state unchanged and stop.
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history or []), ""
        return
    # Work on copies so Gradio's state objects are never mutated in place.
    model_history = list(model_history or [])
    reasoning_chat = clone_messages(reasoning_chat)
    answer_chat = clone_messages(answer_chat)
    reasoning_chat.append({"role": "user", "content": message})
    reasoning_chat.append(
        {
            "role": "assistant",
            "content": "(thinking...)" if thinking else "(reasoning disabled)",
        }
    )
    answer_chat.append({"role": "user", "content": message})
    answer_chat.append({"role": "assistant", "content": ""})
    # First yield: show the user's turn and placeholders before the slow load.
    yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
    try:
        tokenizer, model = get_model()
        # Keep only the last MAX_HISTORY_TURNS user/assistant pairs.
        short_history = model_history[-2 * MAX_HISTORY_TURNS :]
        messages = build_messages(system_prompt, message, short_history, thinking)
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=thinking,  # Qwen3 chat-template switch
        )
        inputs = tokenizer(prompt, return_tensors="pt")
        # Truncate from the left so the most recent context survives.
        # NOTE(review): this can clip the start of the system prompt — confirm
        # that is acceptable for very long histories.
        input_ids = inputs["input_ids"][:, -MAX_INPUT_TOKENS:]
        attention_mask = inputs["attention_mask"][:, -MAX_INPUT_TOKENS:]
        # skip_special_tokens=False so "</think>" survives for splitting;
        # strip_non_think_specials removes the rest for display.
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=False,
            clean_up_tokenization_spaces=False,
            timeout=None,
        )
        generation_kwargs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "max_new_tokens": MAX_NEW_TOKENS,
            "do_sample": True,
            # Sampling presets differ by mode (thinking vs direct answer).
            "temperature": 0.6 if thinking else 0.7,
            "top_p": 0.95 if thinking else 0.8,
            "top_k": 20,
            "repetition_penalty": 1.05,
            "pad_token_id": tokenizer.eos_token_id,
            "streamer": streamer,
        }
        generation_error = {}

        def run_generation():
            # Runs in a background thread; errors are recorded rather than
            # raised, and the streamer is force-ended so the consumer loop
            # below cannot block forever.
            try:
                with _generate_lock:
                    model.generate(**generation_kwargs)
            except Exception as exc:
                generation_error["message"] = str(exc)
                streamer.on_finalized_text("", stream_end=True)

        thread = threading.Thread(target=run_generation, daemon=True)
        thread.start()
        raw_text = ""
        saw_end_think = False
        # Consume streamed chunks and push incremental UI updates.
        for chunk in streamer:
            raw_text += chunk
            reasoning_text, answer_text, saw_end_now = split_stream_text(raw_text, thinking)
            saw_end_think = saw_end_think or saw_end_now
            if thinking:
                if saw_end_think:
                    reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
                else:
                    reasoning_chat[-1]["content"] = reasoning_text or "(thinking...)"
            else:
                reasoning_chat[-1]["content"] = "(reasoning disabled)"
            answer_chat[-1]["content"] = answer_text
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
        thread.join()
        if generation_error:
            # Surface the background-thread failure in the answer pane.
            reasoning_chat[-1]["content"] = ""
            answer_chat[-1]["content"] = f"Error while running the local CPU model: {generation_error['message']}"
            yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
            return
        # Final pass over the complete text.
        reasoning_text, answer_text, saw_end_think = split_stream_text(raw_text, thinking)
        if thinking and not saw_end_think:
            # Model never closed the think block: treat everything as answer.
            reasoning_text = ""
            answer_text = final_cleanup(raw_text)
        if thinking:
            reasoning_chat[-1]["content"] = reasoning_text or "(no reasoning text returned)"
        else:
            reasoning_chat[-1]["content"] = "(reasoning disabled)"
        answer_chat[-1]["content"] = answer_text or "(empty response)"
        # Roll the history forward; only the final answer (not reasoning)
        # is fed back to the model on later turns.
        model_history = short_history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": answer_chat[-1]["content"]},
        ]
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
    except Exception as exc:
        # Setup failures (download, tokenization, template) land here.
        reasoning_chat[-1]["content"] = ""
        answer_chat[-1]["content"] = f"Error while preparing the local CPU model: {exc}"
        yield clone_messages(reasoning_chat), clone_messages(answer_chat), list(model_history), ""
# UI layout and event wiring.
# NOTE(review): original indentation was lost in the scraped copy; the Row
# memberships below (preset+thinking in one row, textboxes full-width) are
# reconstructed from reading order — confirm against the deployed layout.
with gr.Blocks(title="๋ก์ปฌ CPU ๋ถ๋ฆฌํ ์ถ๋ก ์ฑํ ") as demo:
    gr.Markdown(
        "# ๋ก์ปฌ CPU ๋ถ๋ฆฌํ ์ถ๋ก ์ฑํ \n"
        f"๋ก์ปฌ CPU์์ `{MODEL_ID}` ๋ชจ๋ธ์ ์คํํฉ๋๋ค. GGUF๋ ์ธ๋ถ ์ถ๋ก API๋ ์ฌ์ฉํ์ง ์์ต๋๋ค.\n\n"
        "์ฒซ ์์ฒญ์์๋ ๋ชจ๋ธ ๋ค์ด๋ก๋๊ฐ ํ์ํ ์ ์์ด ์ด๊ธฐ ์๋ต์ด ์กฐ๊ธ ๋๋ฆด ์ ์์ต๋๋ค.\n\n"
        "๊ธฐ๋ณธ ์ค์ ์ ํ๊ตญ์ด ๋ต๋ณ ์ฐ์ ์ด๋ฉฐ, reasoning ํจ๋๊ณผ ๋ต๋ณ ํจ๋์ ๋ถ๋ฆฌํด์ ๋ณด์ฌ์ค๋๋ค.\n\n"
        "reasoning๊ณผ assistant์ ๋ณ๋์ ๊ฐ๋ณ ๊ธธ์ด ์ ํ์ ๋์ง ์๊ณ , ์ ์ฒด ์์ฑ ๊ธธ์ด๋ฅผ ๋๋ํ๊ฒ ์ค์ ํ์ต๋๋ค."
    )
    with gr.Row():
        # Preset selector and thinking-mode toggle share one row.
        preset = gr.Dropdown(
            choices=list(PRESETS.keys()),
            value="์ํ",
            label="ํ๋ฆฌ์ ํ๋กฌํํธ",
        )
        thinking = gr.Checkbox(label="์ถ๋ก ์ฌ์ฉ", value=True)
    system_prompt = gr.Textbox(
        label="์์คํ ํ๋กฌํํธ",
        value=PRESETS["์ํ"]["system"],
        lines=4,
    )
    user_input = gr.Textbox(
        label="์ฌ์ฉ์ ๋ฉ์์ง",
        value=PRESETS["์ํ"]["prompt"],
        lines=5,
    )
    with gr.Row():
        send_btn = gr.Button("์ ์ก", variant="primary")
        clear_btn = gr.Button("์ง์ฐ๊ธฐ")
    with gr.Row():
        # Side-by-side panes: reasoning stream on the left, answer on the right.
        reasoning_bot = make_chatbot("์ถ๋ก ", height=520)
        answer_bot = make_chatbot("๋ต๋ณ", height=520)
    # Per-session rolling conversation history fed back to the model.
    model_history_state = gr.State([])
    # Selecting a preset repopulates the prompt fields and thinking toggle.
    preset.change(
        fn=load_preset,
        inputs=preset,
        outputs=[system_prompt, user_input, thinking],
    )
    # Button click and textbox Enter both stream a response.
    send_btn.click(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )
    user_input.submit(
        fn=respond_stream,
        inputs=[user_input, system_prompt, thinking, model_history_state, reasoning_bot, answer_bot],
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )
    clear_btn.click(
        fn=clear_all,
        inputs=None,
        outputs=[reasoning_bot, answer_bot, model_history_state, user_input],
    )

# Queueing is required for generator (streaming) event handlers.
demo.queue()
demo.launch()