Spaces:
Running on Zero
Running on Zero
File size: 10,045 Bytes
296d984 d8d8506 e05a544 8c5bc54 b4f9582 328f82b 8c5bc54 b4f9582 ee51d7a 328f82b ee51d7a e05a544 ef89c06 e05a544 d8d8506 cf33c28 d8d8506 b4f9582 d8d8506 240d8c8 cf33c28 d8d8506 cf33c28 d8d8506 cf33c28 d8d8506 240d8c8 d8d8506 cf33c28 d8d8506 240d8c8 d8d8506 296d984 d8d8506 240d8c8 d8d8506 240d8c8 d8d8506 b4f9582 240d8c8 d8d8506 296d984 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | import json
import os
# Backend switch: True when the SPACE_ID environment variable is set to a
# non-empty value (the transformers/CUDA branch below), False otherwise
# (the local Ollama branch).
IS_SPACE = bool(os.getenv("SPACE_ID"))
# System prompt for the extraction pass. It asks the model for one JSON object
# with five fields: affinity_delta, new_memories, tone_delta, cruel_quote and
# chosen_name. The last two lines are literal example outputs and must stay
# valid JSON. Dashes are real em dashes (U+2014); an earlier copy of this text
# carried a mis-encoded "β" in their place.
_EXTRACTION_SYSTEM = (
    "You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
    "1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
    "Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
    "should score high (+6 to +8):\n"
    " +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
    " +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
    " +1 to +3: ordinary politeness or small talk.\n"
    " 0: neutral or off-topic.\n"
    " -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
    "2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
    "about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
    "the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
    "[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
    "fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
    "3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
    "from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
    "negative on both.\n"
    " +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
    " +1 to +3: friendly or caring phrasing directed at Hollow.\n"
    " 0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
    " -2 to -5: dismissive, cold, impatient.\n"
    " -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
    "4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
    "EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
    "5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
    "letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
    "All integers must be plain JSON numbers — never write a leading + sign.\n"
    "Respond ONLY with valid JSON. No markdown. Examples:\n"
    '{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
    '{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
)
def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Build the two-message chat used for the extraction pass.

    Args:
        user_msg: What the visitor wrote this turn.
        reply: Hollow's reply to that message.

    Returns:
        A list holding a system message (the extraction prompt) followed by a
        user message that quotes both sides of the exchange.
    """
    system_turn = {"role": "system", "content": _EXTRACTION_SYSTEM}
    transcript = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
    user_turn = {"role": "user", "content": transcript}
    return [system_turn, user_turn]
if IS_SPACE:
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Model identifier shared by the tokenizer and the weights below.
_MODEL_ID = "Qwen/Qwen3-8B"
# Tokenizer and model are loaded once, at import time, when IS_SPACE is true.
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
# Weights are loaded in bfloat16; run_turn / run_turn_stream call
# _model.to("cuda") themselves on every invocation.
_model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)
@spaces.GPU(duration=90)
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generate Hollow's reply for one turn, then run the extraction pass on it.

    Args:
        chat_messages: Chat-format message list fed to the tokenizer's chat
            template to produce the reply.
        user_msg: The visitor's message for this turn; quoted in the
            extraction prompt.
        gen_max_tokens: Cap on newly generated tokens for the reply.
        extract_max_tokens: Cap on newly generated tokens for the extraction.

    Returns:
        A ``(reply, raw_json)`` tuple: the stripped reply text and the
        stripped, unparsed text produced by the extraction pass.
    """
    _model.to("cuda")

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Run one chat completion and return only the newly generated text."""
        # transformers 5.x returns BatchEncoding, not a plain tensor
        encoded = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        is_batch = hasattr(encoded, "input_ids")
        prompt_ids = encoded["input_ids"] if is_batch else encoded
        model_inputs = dict(encoded) if is_batch else {"input_ids": encoded}

        sampling = temperature > 0
        penalised = repetition_penalty > 1.0
        with torch.no_grad():
            sequences = _model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                do_sample=sampling,
                # With sampling off, a neutral temperature of 1.0 is passed.
                temperature=temperature if sampling else 1.0,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=4 if penalised else 0,
                pad_token_id=_tokenizer.eos_token_id,
            )

        # The output holds prompt + completion; slice the prompt off.
        prompt_len = prompt_ids.shape[1]
        completion = sequences[0][prompt_len:]
        return _tokenizer.decode(completion, skip_special_tokens=True).strip()

    # repetition_penalty + no_repeat_ngram on the reply only — stops Hollow
    # echoing its own previous line verbatim. Extraction stays deterministic.
    reply = _generate(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repetition_penalty=1.3,
    )
    extraction_messages = _build_extract_messages(user_msg, reply)
    raw_json = _generate(extraction_messages, extract_max_tokens, temperature=0.0)
    return reply, raw_json
from transformers import TextIteratorStreamer
from threading import Thread
@spaces.GPU(duration=90)
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generator: stream Hollow's reply, then run the extraction pass once.

    Yields the cumulative reply string each time new text arrives, and
    finally a ``("__final__", reply, raw_json)`` tuple, where ``reply`` is the
    stripped full reply and ``raw_json`` is the unparsed extraction output.
    Extraction is not streamed and uses greedy decoding (temperature 0.0).

    Args:
        chat_messages: Chat-format message list fed to the tokenizer's chat
            template to produce the reply.
        user_msg: The visitor's message for this turn; quoted in the
            extraction prompt.
        gen_max_tokens: Cap on newly generated tokens for the reply.
        extract_max_tokens: Cap on newly generated tokens for the extraction.

    Raises:
        Exception: Whatever ``_model.generate`` raised in the background
            thread is re-raised here once the stream has been drained.
    """
    _model.to("cuda")

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Run one blocking chat completion and return only the new text."""
        # transformers 5.x returns BatchEncoding, not a plain tensor
        tokenized = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        if hasattr(tokenized, "input_ids"):
            input_ids = tokenized["input_ids"]
            generate_kwargs = dict(tokenized)
        else:
            input_ids = tokenized
            generate_kwargs = {"input_ids": tokenized}
        with torch.no_grad():
            out = _model.generate(
                **generate_kwargs,
                max_new_tokens=max_tokens,
                do_sample=temperature > 0,
                temperature=temperature if temperature > 0 else 1.0,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
                pad_token_id=_tokenizer.eos_token_id,
            )
        # The output holds prompt + completion; decode the completion only.
        return _tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

    tokenized = _tokenizer.apply_chat_template(
        chat_messages,
        add_generation_prompt=True,
        enable_thinking=False,
        return_tensors="pt",
    ).to("cuda")
    gen_inputs = dict(tokenized) if hasattr(tokenized, "input_ids") else {"input_ids": tokenized}
    streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **gen_inputs,
        max_new_tokens=gen_max_tokens,
        do_sample=True,
        temperature=0.8,
        repetition_penalty=1.3,
        no_repeat_ngram_size=4,
        pad_token_id=_tokenizer.eos_token_id,
        streamer=streamer,
    )

    worker_errors = []

    def _generate_in_thread():
        """Run generate(); on failure, end the stream and record the error."""
        try:
            _model.generate(**gen_kwargs)
        except Exception as exc:  # re-raised in the consumer after join()
            worker_errors.append(exc)
            # The streamer is created without a timeout, so the consumer loop
            # waits for its stop signal. generate() only sends that signal on
            # success; send it here so a failure cannot hang the consumer.
            streamer.end()

    thread = Thread(target=_generate_in_thread)
    thread.start()
    reply = ""
    for piece in streamer:
        reply += piece
        yield reply
    thread.join()
    if worker_errors:
        # Surface the background failure instead of extracting from a
        # truncated or empty reply.
        raise worker_errors[0]

    raw_json = _generate(
        _build_extract_messages(user_msg, reply.strip()),
        extract_max_tokens,
        temperature=0.0,
    )
    yield ("__final__", reply.strip(), raw_json)
else:
import requests
# Chat endpoint every request in this branch is POSTed to (localhost, port 11434).
_OLLAMA_URL = "http://localhost:11434/api/chat"
# Model tag sent as the "model" field of every request payload.
_MODEL = "qwen3:8b"
def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
    """POST one non-streaming chat request and return the reply text.

    Args:
        messages: Chat-format message list sent as the "messages" field.
        max_tokens: Sent as the "num_predict" option.
        temperature: Sent as the "temperature" option.
        repeat_penalty: Sent as the "repeat_penalty" option, together with a
            fixed "repeat_last_n" of 256.

    Returns:
        The ``message.content`` field of the JSON response, stripped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    sampling_options = {
        "num_predict": max_tokens,
        "temperature": temperature,
        "repeat_penalty": repeat_penalty,
        "repeat_last_n": 256,
    }
    request_body = {
        "model": _MODEL,
        "messages": messages,
        "stream": False,
        "think": False,
        "options": sampling_options,
    }
    response = requests.post(_OLLAMA_URL, json=request_body, timeout=120)
    response.raise_for_status()
    message = response.json()["message"]
    return message["content"].strip()
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generate Hollow's reply for one turn, then run the extraction pass on it.

    Args:
        chat_messages: Chat-format message list used to produce the reply.
        user_msg: The visitor's message for this turn; quoted in the
            extraction prompt.
        gen_max_tokens: Token cap for the reply request.
        extract_max_tokens: Token cap for the extraction request.

    Returns:
        A ``(reply, raw_json)`` tuple: the reply text and the unparsed text
        returned by the extraction request.
    """
    # repeat_penalty on the reply only — stops Hollow echoing its own line
    reply = _ollama_chat(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repeat_penalty=1.3,
    )
    extraction_messages = _build_extract_messages(user_msg, reply)
    raw_json = _ollama_chat(extraction_messages, extract_max_tokens, temperature=0.0)
    return reply, raw_json
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generator: yields the cumulative reply string as tokens arrive, then
    yields a final ("__final__", reply, raw_json) tuple. Extraction runs once
    after the stream (deterministic, not streamed).

    Args:
        chat_messages: Chat-format message list used to produce the reply.
        user_msg: The visitor's message for this turn; quoted in the
            extraction prompt.
        gen_max_tokens: Token cap for the streamed reply.
        extract_max_tokens: Token cap for the extraction request.

    Raises:
        requests.HTTPError: If the server answers with an error status, or
            if it reports an error object part-way through the stream.
    """
    payload = {
        "model": _MODEL,
        "messages": chat_messages,
        "stream": True,
        "think": False,
        "options": {
            "num_predict": gen_max_tokens,
            "temperature": 0.8,
            "repeat_penalty": 1.3,
            "repeat_last_n": 256,
        },
    }
    reply = ""
    with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
        r.raise_for_status()
        # The body is newline-delimited JSON: one object per non-empty line.
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            if "error" in chunk:
                # Once streaming has begun the HTTP status can no longer
                # change, so a failure arrives as an {"error": ...} object.
                # Raise rather than treat a truncated reply as complete.
                raise requests.HTTPError(
                    f"Ollama stream error: {chunk['error']}", response=r
                )
            piece = chunk.get("message", {}).get("content", "")
            if piece:
                reply += piece
                yield reply
            if chunk.get("done"):
                break
    raw_json = _ollama_chat(
        _build_extract_messages(user_msg, reply.strip()),
        extract_max_tokens,
        temperature=0.0,
    )
    yield ("__final__", reply.strip(), raw_json)
|