"""Chat-turn backends: generate Hollow's reply, then extract turn metadata.

Two interchangeable backends expose the same public API (``run_turn`` and
``run_turn_stream``).  Which one is defined is decided at import time by
``IS_SPACE``:

* ``IS_SPACE`` true  -> in-process ``transformers`` model on CUDA.
* ``IS_SPACE`` false -> HTTP calls to a local Ollama server.
"""

import json
import os

# True when the SPACE_ID environment variable is set to a non-empty value.
IS_SPACE = bool(os.environ.get("SPACE_ID"))

# Sampling settings for Hollow's reply.  The repetition penalty is applied to
# the reply only (it stops Hollow echoing its own previous line verbatim);
# extraction runs with temperature 0.0 and no penalty.
_REPLY_TEMPERATURE = 0.8
_REPLY_REPETITION_PENALTY = 1.3

# System prompt for the second (extraction) model call of every turn.
_EXTRACTION_SYSTEM = (
    "You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
    "1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
    "Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
    "should score high (+6 to +8):\n"
    " +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
    " +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
    " +1 to +3: ordinary politeness or small talk.\n"
    " 0: neutral or off-topic.\n"
    " -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
    "2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
    "about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
    "the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
    "[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
    "fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
    "3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
    "from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
    "negative on both.\n"
    " +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
    " +1 to +3: friendly or caring phrasing directed at Hollow.\n"
    " 0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
    " -2 to -5: dismissive, cold, impatient.\n"
    " -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
    "4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
    "EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
    "5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
    "letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
    "All integers must be plain JSON numbers — never write a leading + sign.\n"
    "Respond ONLY with valid JSON. No markdown. Examples:\n"
    '{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
    '{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
)


def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Return the two-message chat (system + user) for the extraction call.

    Args:
        user_msg: What the visitor wrote this turn.
        reply: Hollow's reply to that message.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: the extraction
        system prompt followed by one user message quoting the exchange.
    """
    return [
        {"role": "system", "content": _EXTRACTION_SYSTEM},
        {"role": "user", "content": f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"},
    ]


if IS_SPACE:
    from threading import Thread

    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    _MODEL_ID = "Qwen/Qwen3-8B"
    _tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
    _model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)

    def _encode_chat(messages):
        """Tokenize *messages* with the chat template and move them to CUDA.

        Returns:
            ``(input_ids, model_inputs)`` where ``model_inputs`` is the kwargs
            dict to splat into ``_model.generate``.
        """
        tokenized = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        # transformers 5.x returns BatchEncoding, not a plain tensor
        if hasattr(tokenized, "input_ids"):
            return tokenized["input_ids"], dict(tokenized)
        return tokenized, {"input_ids": tokenized}

    def _sampling_kwargs(max_tokens, temperature, repetition_penalty):
        """Build the decoding kwargs shared by blocking and streamed generation.

        ``temperature <= 0`` selects greedy decoding (``do_sample=False``); the
        temperature passed to ``generate`` is then 1.0.  The 4-gram repeat
        block is only enabled together with a repetition penalty above 1.0.
        """
        sampling = temperature > 0
        return {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            "temperature": temperature if sampling else 1.0,
            "repetition_penalty": repetition_penalty,
            "no_repeat_ngram_size": 4 if repetition_penalty > 1.0 else 0,
            "pad_token_id": _tokenizer.eos_token_id,
        }

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Run one blocking generation and return only the newly generated text.

        Args:
            messages: Chat messages to feed through the chat template.
            max_tokens: Upper bound on newly generated tokens.
            temperature: Sampling temperature; ``<= 0`` means greedy decoding.
            repetition_penalty: Passed to ``generate``; values above 1.0 also
                switch on ``no_repeat_ngram_size=4``.

        Returns:
            The decoded continuation (prompt tokens removed), stripped.
        """
        input_ids, model_inputs = _encode_chat(messages)
        with torch.no_grad():
            out = _model.generate(
                **model_inputs,
                **_sampling_kwargs(max_tokens, temperature, repetition_penalty),
            )
        # Slice off the prompt so only the continuation is decoded.
        new_tokens = out[0][input_ids.shape[1]:]
        return _tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    @spaces.GPU(duration=90)
    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Generate Hollow's reply, then extract turn metadata from the exchange.

        Args:
            chat_messages: Chat history (role/content dicts) for the reply.
            user_msg: The visitor's message this turn (used for extraction).
            gen_max_tokens: Token budget for the reply.
            extract_max_tokens: Token budget for the extraction output.

        Returns:
            ``(reply, raw_json)`` — the reply text and the extraction model's
            raw, unparsed output string.
        """
        _model.to("cuda")
        # repetition_penalty + no_repeat_ngram on the reply only — stops Hollow
        # echoing its own previous line verbatim. Extraction stays deterministic.
        reply = _generate(
            chat_messages,
            gen_max_tokens,
            temperature=_REPLY_TEMPERATURE,
            repetition_penalty=_REPLY_REPETITION_PENALTY,
        )
        raw_json = _generate(
            _build_extract_messages(user_msg, reply),
            extract_max_tokens,
            temperature=0.0,
        )
        return reply, raw_json

    @spaces.GPU(duration=90)
    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Stream Hollow's reply, then run extraction once.

        Generator: yields the cumulative reply string as text arrives, then
        yields a final ``("__final__", reply, raw_json)`` tuple.  Extraction
        runs once after the stream (greedy, not streamed).

        Raises:
            Exception: whatever ``_model.generate`` raised in the worker
                thread, re-raised here once the stream has ended.
        """
        _model.to("cuda")
        _, model_inputs = _encode_chat(chat_messages)
        streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(
            **model_inputs,
            **_sampling_kwargs(gen_max_tokens, _REPLY_TEMPERATURE, _REPLY_REPETITION_PENALTY),
            streamer=streamer,
        )
        failures = []

        def _worker():
            # If generate() dies, nothing would ever close the streamer and the
            # consumer loop below would block forever.  End the stream
            # ourselves and hand the exception back to the calling thread.
            try:
                _model.generate(**gen_kwargs)
            except Exception as exc:
                failures.append(exc)
                streamer.end()

        thread = Thread(target=_worker)
        thread.start()

        reply = ""
        for piece in streamer:
            reply += piece
            yield reply
        thread.join()
        if failures:
            raise failures[0]

        final_reply = reply.strip()
        raw_json = _generate(
            _build_extract_messages(user_msg, final_reply),
            extract_max_tokens,
            temperature=0.0,
        )
        yield ("__final__", final_reply, raw_json)

else:
    import requests

    _OLLAMA_URL = "http://localhost:11434/api/chat"
    _MODEL = "qwen3:8b"
    _OLLAMA_TIMEOUT = 120  # passed as `timeout=` to requests.post

    def _chat_payload(messages, max_tokens, temperature, repeat_penalty, stream):
        """Build the JSON body for an Ollama ``/api/chat`` request."""
        return {
            "model": _MODEL,
            "messages": messages,
            "stream": stream,
            "think": False,
            "options": {
                "num_predict": max_tokens,
                "temperature": temperature,
                "repeat_penalty": repeat_penalty,
                "repeat_last_n": 256,
            },
        }

    def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
        """Run one blocking (non-streamed) chat completion against Ollama.

        Returns:
            The assistant message content, stripped.

        Raises:
            requests.HTTPError: on a non-2xx response (``raise_for_status``).
        """
        payload = _chat_payload(messages, max_tokens, temperature, repeat_penalty, stream=False)
        resp = requests.post(_OLLAMA_URL, json=payload, timeout=_OLLAMA_TIMEOUT)
        resp.raise_for_status()
        return resp.json()["message"]["content"].strip()

    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Generate Hollow's reply, then extract turn metadata from the exchange.

        Args:
            chat_messages: Chat history (role/content dicts) for the reply.
            user_msg: The visitor's message this turn (used for extraction).
            gen_max_tokens: Token budget for the reply.
            extract_max_tokens: Token budget for the extraction output.

        Returns:
            ``(reply, raw_json)`` — the reply text and the extraction model's
            raw, unparsed output string.
        """
        # repeat_penalty on the reply only — stops Hollow echoing its own line
        reply = _ollama_chat(
            chat_messages,
            gen_max_tokens,
            temperature=_REPLY_TEMPERATURE,
            repeat_penalty=_REPLY_REPETITION_PENALTY,
        )
        raw_json = _ollama_chat(
            _build_extract_messages(user_msg, reply),
            extract_max_tokens,
            temperature=0.0,
        )
        return reply, raw_json

    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Stream Hollow's reply, then run extraction once.

        Generator: yields the cumulative reply string as tokens arrive, then
        yields a final ``("__final__", reply, raw_json)`` tuple.  Extraction
        runs once after the stream (temperature 0.0, not streamed).

        Raises:
            requests.HTTPError: on a non-2xx response (``raise_for_status``).
            requests.RequestException: if a streamed chunk carries an
                ``"error"`` field instead of message content.
        """
        payload = _chat_payload(
            chat_messages,
            gen_max_tokens,
            _REPLY_TEMPERATURE,
            _REPLY_REPETITION_PENALTY,
            stream=True,
        )
        reply = ""
        with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=_OLLAMA_TIMEOUT) as resp:
            resp.raise_for_status()
            for line in resp.iter_lines():
                if not line:
                    continue
                chunk = json.loads(line)
                # Surface a mid-stream error object instead of silently
                # continuing with an empty or truncated reply.
                if chunk.get("error"):
                    raise requests.RequestException(
                        f"Ollama stream error: {chunk['error']}", response=resp
                    )
                piece = chunk.get("message", {}).get("content", "")
                if piece:
                    reply += piece
                    yield reply
                if chunk.get("done"):
                    break

        final_reply = reply.strip()
        raw_json = _ollama_chat(
            _build_extract_messages(user_msg, final_reply),
            extract_max_tokens,
            temperature=0.0,
        )
        yield ("__final__", final_reply, raw_json)