hollow / engine.py
Pabloler21's picture
feat(child): it gropes toward a name as it recovers itself
e05a544
Raw
History Blame Contribute Delete
10 kB
import json
import os
# Backend switch: truthy when the SPACE_ID environment variable is set to a
# non-empty value. The module then uses the in-process transformers backend;
# otherwise it talks to a local Ollama server.
# NOTE(review): SPACE_ID is presumably injected by the Hugging Face Spaces
# runtime — confirm.
IS_SPACE = bool(os.environ.get("SPACE_ID"))
# System prompt for the per-turn extraction pass (see _build_extract_messages).
# It asks the model for one JSON object with the keys affinity_delta,
# new_memories, tone_delta, cruel_quote and chosen_name, and ends with two
# example outputs. The text is sent to the model verbatim, so any edit to
# these literals changes runtime behaviour.
_EXTRACTION_SYSTEM = (
"You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
"1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
"Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
"should score high (+6 to +8):\n"
" +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
" +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
" +1 to +3: ordinary politeness or small talk.\n"
" 0: neutral or off-topic.\n"
" -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
"2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
"about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
"the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
"[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
"fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
"3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
"from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
"negative on both.\n"
" +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
" +1 to +3: friendly or caring phrasing directed at Hollow.\n"
" 0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
" -2 to -5: dismissive, cold, impatient.\n"
" -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
"4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
"EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
"5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
"letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
"All integers must be plain JSON numbers — never write a leading + sign.\n"
"Respond ONLY with valid JSON. No markdown. Examples:\n"
'{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
'{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
)
def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Build the two-message chat prompt for the extraction pass.

    Args:
        user_msg: What the visitor wrote this turn.
        reply: Hollow's reply to that message.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the extraction system
        prompt, then a user message embedding both sides of the exchange.
    """
    exchange = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
    system_turn = {"role": "system", "content": _EXTRACTION_SYSTEM}
    user_turn = {"role": "user", "content": exchange}
    return [system_turn, user_turn]
if IS_SPACE:
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hub id of the model used by the Spaces backend for both the reply and the
# extraction pass.
_MODEL_ID = "Qwen/Qwen3-8B"
# Tokenizer and weights are loaded once, when this branch of the module is
# executed. The weights are requested in bfloat16; they are moved to the GPU
# later, inside the @spaces.GPU-decorated functions.
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
_model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)
@spaces.GPU(duration=90)
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Run one full turn: generate Hollow's reply, then the extraction output.

    Args:
        chat_messages: Chat-template messages used to generate the reply.
        user_msg: The visitor's message this turn, embedded in the
            extraction prompt.
        gen_max_tokens: Cap on newly generated tokens for the reply.
        extract_max_tokens: Cap on newly generated tokens for the extraction.

    Returns:
        ``(reply, raw_json)``: the stripped reply text and the stripped,
        unparsed text produced by the extraction pass.
    """
    _model.to("cuda")

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Render ``messages``, generate, and decode only the new tokens."""
        encoded = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        # transformers 5.x hands back a BatchEncoding rather than a plain
        # tensor; normalise both shapes into keyword arguments for generate().
        if hasattr(encoded, "input_ids"):
            model_inputs = dict(encoded)
        else:
            model_inputs = {"input_ids": encoded}
        prompt_len = model_inputs["input_ids"].shape[1]

        sampling = temperature > 0
        settings = {
            "max_new_tokens": max_tokens,
            "do_sample": sampling,
            # A temperature of 0 means greedy decoding; pass a neutral 1.0 then.
            "temperature": temperature if sampling else 1.0,
            "repetition_penalty": repetition_penalty,
            # The n-gram block is only enabled together with a repetition penalty.
            "no_repeat_ngram_size": 4 if repetition_penalty > 1.0 else 0,
            "pad_token_id": _tokenizer.eos_token_id,
        }
        with torch.no_grad():
            sequences = _model.generate(**model_inputs, **settings)
        # Drop the prompt tokens so only the completion is decoded.
        completion = sequences[0][prompt_len:]
        return _tokenizer.decode(completion, skip_special_tokens=True).strip()

    # Only the reply gets the repetition penalty (and with it the n-gram
    # block), which keeps Hollow from echoing its previous line verbatim.
    reply = _generate(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repetition_penalty=1.3,
    )
    # The extraction pass runs greedily so it stays deterministic.
    extraction_prompt = _build_extract_messages(user_msg, reply)
    raw_json = _generate(extraction_prompt, extract_max_tokens, temperature=0.0)
    return reply, raw_json
from transformers import TextIteratorStreamer
from threading import Thread
@spaces.GPU(duration=90)
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Stream Hollow's reply, then run the extraction pass once.

    Generator. Yields the cumulative reply string each time the streamer
    delivers new text, then yields a final ``("__final__", reply, raw_json)``
    tuple in which ``reply`` is stripped and ``raw_json`` is the stripped,
    unparsed extraction output.

    Args:
        chat_messages: Chat-template messages used to generate the reply.
        user_msg: The visitor's message this turn, embedded in the
            extraction prompt.
        gen_max_tokens: Cap on newly generated tokens for the reply.
        extract_max_tokens: Cap on newly generated tokens for the extraction.

    Raises:
        Exception: Whatever ``_model.generate`` raised in the background
            thread is re-raised here once the stream has been drained.
    """
    _model.to("cuda")

    def _encode(messages):
        """Apply the chat template and return keyword arguments for generate()."""
        encoded = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        # transformers 5.x returns BatchEncoding, not a plain tensor
        if hasattr(encoded, "input_ids"):
            return dict(encoded)
        return {"input_ids": encoded}

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Blocking generation; returns only the newly generated text, stripped."""
        model_inputs = _encode(messages)
        prompt_len = model_inputs["input_ids"].shape[1]
        with torch.no_grad():
            out = _model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                do_sample=temperature > 0,
                temperature=temperature if temperature > 0 else 1.0,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
                pad_token_id=_tokenizer.eos_token_id,
            )
        return _tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

    streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **_encode(chat_messages),
        max_new_tokens=gen_max_tokens,
        do_sample=True,
        temperature=0.8,
        repetition_penalty=1.3,
        no_repeat_ngram_size=4,
        pad_token_id=_tokenizer.eos_token_id,
        streamer=streamer,
    )

    failures = []

    def _worker():
        """Run generation; on failure, record it and unblock the consumer."""
        try:
            _model.generate(**gen_kwargs)
        except Exception as exc:
            failures.append(exc)
            # generate() only signals end-of-stream when it finishes normally.
            # Without this the loop below would wait on the queue forever.
            streamer.end()

    thread = Thread(target=_worker)
    thread.start()

    reply = ""
    for piece in streamer:
        reply += piece
        yield reply
    thread.join()

    if failures:
        # Surface the background error instead of extracting from a partial reply.
        raise failures[0]

    # Extraction runs once, after the stream, with greedy decoding.
    final_reply = reply.strip()
    raw_json = _generate(
        _build_extract_messages(user_msg, final_reply),
        extract_max_tokens,
        temperature=0.0,
    )
    yield ("__final__", final_reply, raw_json)
else:
import requests
# Chat endpoint that both the blocking and the streaming Ollama calls POST to.
_OLLAMA_URL = "http://localhost:11434/api/chat"
# Model tag sent as "model" in every Ollama request payload.
_MODEL = "qwen3:8b"
def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
    """Send one non-streaming chat request to Ollama and return the reply text.

    Args:
        messages: Chat messages forwarded as the request's ``messages``.
        max_tokens: Sent as the ``num_predict`` option.
        temperature: Sent as the ``temperature`` option.
        repeat_penalty: Sent as the ``repeat_penalty`` option.

    Returns:
        The ``message.content`` field of the JSON response, stripped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    sampling_options = {
        "num_predict": max_tokens,
        "temperature": temperature,
        "repeat_penalty": repeat_penalty,
        "repeat_last_n": 256,
    }
    body = {
        "model": _MODEL,
        "messages": messages,
        "stream": False,
        "think": False,
        "options": sampling_options,
    }
    response = requests.post(_OLLAMA_URL, json=body, timeout=120)
    response.raise_for_status()
    message = response.json()["message"]
    return message["content"].strip()
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Run one full turn against Ollama: reply first, extraction second.

    Args:
        chat_messages: Chat messages used to generate Hollow's reply.
        user_msg: The visitor's message this turn, embedded in the
            extraction prompt.
        gen_max_tokens: Token cap for the reply.
        extract_max_tokens: Token cap for the extraction output.

    Returns:
        ``(reply, raw_json)``: the reply text and the unparsed extraction
        output, both as returned by ``_ollama_chat``.
    """
    # Only the reply is generated with a repeat penalty, so Hollow does not
    # echo its own line.
    reply = _ollama_chat(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repeat_penalty=1.3,
    )
    extraction_prompt = _build_extract_messages(user_msg, reply)
    raw_json = _ollama_chat(extraction_prompt, extract_max_tokens, temperature=0.0)
    return reply, raw_json
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Stream Hollow's reply from Ollama, then run the extraction pass once.

    Generator: yields the cumulative reply string as tokens arrive, then
    yields a final ``("__final__", reply, raw_json)`` tuple. Extraction runs
    once after the stream (deterministic, not streamed).

    Args:
        chat_messages: Chat messages used to generate Hollow's reply.
        user_msg: The visitor's message this turn, embedded in the
            extraction prompt.
        gen_max_tokens: Sent as ``num_predict`` for the reply.
        extract_max_tokens: Token cap for the extraction output.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the server reports an error inside the
            stream after the response has already started.
    """
    payload = {
        "model": _MODEL,
        "messages": chat_messages,
        "stream": True,
        "think": False,
        "options": {"num_predict": gen_max_tokens, "temperature": 0.8,
                    "repeat_penalty": 1.3, "repeat_last_n": 256},
    }
    reply = ""
    with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # A failure after streaming has begun arrives as an
            # {"error": ...} line under HTTP 200, so raise_for_status()
            # cannot catch it. Raise here rather than treat the failed
            # stream as an empty or partial reply.
            if chunk.get("error"):
                raise requests.RequestException(
                    f"Ollama stream error: {chunk['error']}", response=r
                )
            piece = chunk.get("message", {}).get("content", "")
            if piece:
                reply += piece
                yield reply
            if chunk.get("done"):
                break
    final_reply = reply.strip()
    raw_json = _ollama_chat(_build_extract_messages(user_msg, final_reply),
                            extract_max_tokens, temperature=0.0)
    yield ("__final__", final_reply, raw_json)