Spaces:

Pabloler21
/

hollow

Sleeping

App Files Files Community

hollow / engine.py

Pabloler21

feat(child): it gropes toward a name as it recovers itself

e05a544 11 days ago

Raw

History Blame Contribute Delete

10 kB

	import json
	import os

	# True when the SPACE_ID environment variable is set to a non-empty value.
	# Selects the in-process transformers backend below; otherwise the Ollama
	# HTTP backend is used.
	# NOTE(review): SPACE_ID is presumably set by the Hugging Face Spaces runtime — confirm.
	IS_SPACE = bool(os.environ.get("SPACE_ID"))

	# System prompt for the extraction pass that runs after each reply. It asks the
	# model for a single JSON object with five keys (affinity_delta, new_memories,
	# tone_delta, cruel_quote, chosen_name) describing the latest exchange.
	# _build_extract_messages sends this text verbatim as the "system" message, so
	# any wording change here changes what the model is asked to do.
	_EXTRACTION_SYSTEM = (
	"You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
	"1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
	"Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
	"should score high (+6 to +8):\n"
	" +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
	" +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
	" +1 to +3: ordinary politeness or small talk.\n"
	" 0: neutral or off-topic.\n"
	" -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
	"2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
	"about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
	"the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
	"[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
	"fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
	"3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
	"from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
	"negative on both.\n"
	" +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
	" +1 to +3: friendly or caring phrasing directed at Hollow.\n"
	" 0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
	" -2 to -5: dismissive, cold, impatient.\n"
	" -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
	"4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
	"EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
	"5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
	"letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
	"All integers must be plain JSON numbers — never write a leading + sign.\n"
	"Respond ONLY with valid JSON. No markdown. Examples:\n"
	'{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
	'{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
	)


	def _build_extract_messages(user_msg: str, reply: str) -> list:
	    """Build the two-message chat prompt used for the extraction pass.

	    Args:
	        user_msg: The visitor's message for this turn.
	        reply: Hollow's reply to that message.

	    Returns:
	        A list with the extraction system message first, followed by a user
	        message that quotes both sides of the exchange.
	    """
	    system_turn = {"role": "system", "content": _EXTRACTION_SYSTEM}
	    exchange = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
	    user_turn = {"role": "user", "content": exchange}
	    return [system_turn, user_turn]


	if IS_SPACE:
	import spaces
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Checkpoint id passed to both from_pretrained calls below.
	_MODEL_ID = "Qwen/Qwen3-8B"
	# Tokenizer and model are created once, when this module is imported, and are
	# reused by every turn.
	_tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
	# Weights are requested in bfloat16; run_turn / run_turn_stream move the model
	# to CUDA at the start of each call.
	_model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)

	@spaces.GPU(duration=90)
	def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
	    """Generate Hollow's reply for one turn, then run the extraction pass on it.

	    Args:
	        chat_messages: Chat-format messages handed to the tokenizer's chat template.
	        user_msg: The visitor's latest message, quoted in the extraction prompt.
	        gen_max_tokens: Cap on newly generated tokens for the reply.
	        extract_max_tokens: Cap on newly generated tokens for the extraction output.

	    Returns:
	        A ``(reply, raw_json)`` tuple of decoded strings; ``raw_json`` is the
	        model's extraction output, returned unparsed.
	    """
	    _model.to("cuda")

	    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
	        # transformers 5.x returns BatchEncoding, not a plain tensor
	        encoded = _tokenizer.apply_chat_template(
	            messages,
	            add_generation_prompt=True,
	            enable_thinking=False,
	            return_tensors="pt",
	        ).to("cuda")
	        is_batch = hasattr(encoded, "input_ids")
	        prompt_ids = encoded["input_ids"] if is_batch else encoded
	        model_inputs = dict(encoded) if is_batch else {"input_ids": encoded}

	        # temperature == 0 means greedy decoding; generate() still receives a
	        # neutral temperature of 1.0 in that case.
	        sampling = temperature > 0
	        penalised = repetition_penalty > 1.0
	        with torch.no_grad():
	            sequences = _model.generate(
	                **model_inputs,
	                max_new_tokens=max_tokens,
	                do_sample=sampling,
	                temperature=temperature if sampling else 1.0,
	                repetition_penalty=repetition_penalty,
	                no_repeat_ngram_size=4 if penalised else 0,
	                pad_token_id=_tokenizer.eos_token_id,
	            )
	        # Drop the prompt tokens so only the newly generated text is decoded.
	        prompt_len = prompt_ids.shape[1]
	        completion = sequences[0][prompt_len:]
	        return _tokenizer.decode(completion, skip_special_tokens=True).strip()

	    # repetition_penalty + no_repeat_ngram on the reply only — stops Hollow
	    # echoing its own previous line verbatim. Extraction stays deterministic.
	    reply = _generate(
	        chat_messages,
	        gen_max_tokens,
	        temperature=0.8,
	        repetition_penalty=1.3,
	    )
	    extraction_prompt = _build_extract_messages(user_msg, reply)
	    raw_json = _generate(extraction_prompt, extract_max_tokens, temperature=0.0)
	    return reply, raw_json

	from transformers import TextIteratorStreamer
	from threading import Thread

	@spaces.GPU(duration=90)
	def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
	    """Generator: stream Hollow's reply, then yield the final result tuple.

	    Yields the cumulative reply string each time the streamer produces a new
	    piece of text, then yields a final ``("__final__", reply, raw_json)``
	    tuple. Extraction runs once after the stream (not streamed).

	    Args:
	        chat_messages: Chat-format messages handed to the tokenizer's chat template.
	        user_msg: The visitor's latest message, quoted in the extraction prompt.
	        gen_max_tokens: Cap on newly generated tokens for the reply.
	        extract_max_tokens: Cap on newly generated tokens for the extraction output.

	    Raises:
	        Exception: Whatever the background ``generate`` call raised, re-raised
	            in the consuming thread once the stream has been closed.
	    """
	    _model.to("cuda")

	    def _encode(messages):
	        """Return ``(input_ids, generate_kwargs)`` for a chat prompt on CUDA."""
	        # transformers 5.x returns BatchEncoding, not a plain tensor
	        tokenized = _tokenizer.apply_chat_template(
	            messages,
	            add_generation_prompt=True,
	            enable_thinking=False,
	            return_tensors="pt",
	        ).to("cuda")
	        if hasattr(tokenized, "input_ids"):
	            return tokenized["input_ids"], dict(tokenized)
	        return tokenized, {"input_ids": tokenized}

	    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
	        """Blocking generation; returns only the newly generated text."""
	        input_ids, generate_kwargs = _encode(messages)
	        with torch.no_grad():
	            out = _model.generate(
	                **generate_kwargs,
	                max_new_tokens=max_tokens,
	                do_sample=temperature > 0,
	                temperature=temperature if temperature > 0 else 1.0,
	                repetition_penalty=repetition_penalty,
	                no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
	                pad_token_id=_tokenizer.eos_token_id,
	            )
	        return _tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

	    _, gen_inputs = _encode(chat_messages)
	    streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
	    gen_kwargs = dict(**gen_inputs, max_new_tokens=gen_max_tokens, do_sample=True,
	                      temperature=0.8, repetition_penalty=1.3, no_repeat_ngram_size=4,
	                      pad_token_id=_tokenizer.eos_token_id, streamer=streamer)
	    worker_errors = []

	    def _worker():
	        # If generate() fails, nothing else signals the streamer, and the
	        # consuming loop below would block forever. Record the error and
	        # close the stream so the caller sees the failure.
	        try:
	            _model.generate(**gen_kwargs)
	        except Exception as exc:
	            worker_errors.append(exc)
	            streamer.end()

	    thread = Thread(target=_worker)
	    thread.start()
	    reply = ""
	    for piece in streamer:
	        reply += piece
	        yield reply
	    thread.join()
	    if worker_errors:
	        raise worker_errors[0]
	    raw_json = _generate(_build_extract_messages(user_msg, reply.strip()),
	                         extract_max_tokens, temperature=0.0)
	    yield ("__final__", reply.strip(), raw_json)

	else:
	import requests

	# Chat endpoint that every request in this branch is POSTed to.
	_OLLAMA_URL = "http://localhost:11434/api/chat"
	# Model tag sent as "model" in each request payload.
	_MODEL = "qwen3:8b"

	def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
	    """Send one non-streaming chat request and return the reply text.

	    Args:
	        messages: Chat-format messages placed in the request payload.
	        max_tokens: Sent as the ``num_predict`` option.
	        temperature: Sent as the ``temperature`` option.
	        repeat_penalty: Sent as the ``repeat_penalty`` option.

	    Returns:
	        The ``message.content`` field of the JSON response, stripped.

	    Raises:
	        requests.HTTPError: If the response status indicates failure.
	    """
	    sampling_options = {
	        "num_predict": max_tokens,
	        "temperature": temperature,
	        "repeat_penalty": repeat_penalty,
	        "repeat_last_n": 256,
	    }
	    request_body = {
	        "model": _MODEL,
	        "messages": messages,
	        "stream": False,
	        "think": False,
	        "options": sampling_options,
	    }
	    response = requests.post(_OLLAMA_URL, json=request_body, timeout=120)
	    response.raise_for_status()
	    message = response.json()["message"]
	    return message["content"].strip()

	def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
	    """Generate Hollow's reply for one turn, then run the extraction pass on it.

	    Returns:
	        A ``(reply, raw_json)`` tuple; ``raw_json`` is the extraction output,
	        returned unparsed.
	    """
	    # repeat_penalty on the reply only — stops Hollow echoing its own line
	    reply = _ollama_chat(
	        chat_messages,
	        gen_max_tokens,
	        temperature=0.8,
	        repeat_penalty=1.3,
	    )
	    extraction_prompt = _build_extract_messages(user_msg, reply)
	    raw_json = _ollama_chat(extraction_prompt, extract_max_tokens, temperature=0.0)
	    return reply, raw_json

	def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
	    """Generator: yields the cumulative reply string as tokens arrive, then
	    yields a final ("__final__", reply, raw_json) tuple. Extraction runs once
	    after the stream (deterministic, not streamed).

	    Args:
	        chat_messages: Chat-format messages placed in the request payload.
	        user_msg: The visitor's latest message, quoted in the extraction prompt.
	        gen_max_tokens: Sent as ``num_predict`` for the streamed reply.
	        extract_max_tokens: Sent as ``num_predict`` for the extraction request.

	    Raises:
	        requests.HTTPError: If the response status indicates failure.
	        requests.RequestException: If the stream carries an ``error`` object.
	    """
	    payload = {
	        "model": _MODEL,
	        "messages": chat_messages,
	        "stream": True,
	        "think": False,
	        "options": {"num_predict": gen_max_tokens, "temperature": 0.8,
	                    "repeat_penalty": 1.3, "repeat_last_n": 256},
	    }
	    reply = ""
	    with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
	        r.raise_for_status()
	        for line in r.iter_lines():
	            if not line:
	                continue
	            chunk = json.loads(line)
	            # A failure after the 200 status line arrives as an {"error": ...}
	            # object in the stream. Raise instead of treating it as an empty
	            # piece, so a failed generation is not mistaken for a short reply.
	            stream_error = chunk.get("error")
	            if stream_error:
	                raise requests.RequestException(f"Ollama stream error: {stream_error}")
	            piece = chunk.get("message", {}).get("content", "")
	            if piece:
	                reply += piece
	                yield reply
	            if chunk.get("done"):
	                break
	    raw_json = _ollama_chat(_build_extract_messages(user_msg, reply.strip()),
	                            extract_max_tokens, temperature=0.0)
	    yield ("__final__", reply.strip(), raw_json)