Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
# True when the SPACE_ID environment variable is set to a non-empty value;
# the rest of the module uses this flag to pick between the in-process
# transformers backend and the local Ollama HTTP backend.
IS_SPACE = bool(os.getenv("SPACE_ID"))
# System prompt for the extraction pass that follows every reply. It asks the
# model to score the exchange and answer with valid JSON carrying five fields:
# affinity_delta, new_memories, tone_delta, cruel_quote and chosen_name.
# Each tuple entry below is one section of the prompt; joining them yields the
# full prompt text, so the wording must not be edited casually.
_EXTRACTION_SYSTEM = "".join((
    # Task framing.
    "You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n",
    # 1. affinity_delta and its scoring scale.
    "1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
    "Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
    "should score high (+6 to +8):\n"
    " +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
    " +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
    " +1 to +3: ordinary politeness or small talk.\n"
    " 0: neutral or off-topic.\n"
    " -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n",
    # 2. new_memories: at most one memory, taken from the visitor's words only.
    "2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
    "about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
    "the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
    '["a dog named Pepe who was funny and who they loved"]). Prefer one rich memory over several '
    "fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n",
    # 3. tone_delta and its scoring scale.
    "3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
    "from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
    "negative on both.\n"
    " +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
    " +1 to +3: friendly or caring phrasing directed at Hollow.\n"
    " 0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
    " -2 to -5: dismissive, cold, impatient.\n"
    " -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n",
    # 4. cruel_quote: verbatim visitor text, or null.
    "4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
    "EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n",
    # 5. chosen_name: verbatim name Hollow offered for itself, or null.
    "5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
    "letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n",
    # Output-format rules.
    "All integers must be plain JSON numbers — never write a leading + sign.\n"
    "Respond ONLY with valid JSON. No markdown. Examples:\n",
    # Two worked examples, one per line; the last line has no trailing newline.
    '{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], '
    '"tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n',
    '{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, '
    '"cruel_quote": "you are a creepy little freak", "chosen_name": null}',
))
def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Build the two-message chat used for the extraction pass.

    Args:
        user_msg: What the visitor wrote this turn.
        reply: What Hollow answered this turn.

    Returns:
        A list of two chat dicts: the ``_EXTRACTION_SYSTEM`` prompt as the
        system message, then a user message quoting both sides of the
        exchange and ending with the "Extract now:" cue.
    """
    transcript = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
    system_message = {"role": "system", "content": _EXTRACTION_SYSTEM}
    user_message = {"role": "user", "content": transcript}
    return [system_message, user_message]
| if IS_SPACE: | |
| import spaces | |
| import torch | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Hub id of the checkpoint used for both the reply and the extraction pass.
_MODEL_ID = "Qwen/Qwen3-8B"

# Tokenizer and weights are loaded once, when this module is imported.
_tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
_model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    dtype=torch.bfloat16,  # weights are requested in bfloat16
)
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Run one non-streaming turn on the in-process model.

    First samples Hollow's reply to ``chat_messages``, then runs a second,
    greedy pass that asks the model for the JSON extraction of the exchange.

    Args:
        chat_messages: Chat-template messages for the reply generation.
        user_msg: The visitor's text for this turn (fed to the extraction pass).
        gen_max_tokens: ``max_new_tokens`` for the reply.
        extract_max_tokens: ``max_new_tokens`` for the extraction output.

    Returns:
        ``(reply, raw_json)`` — both are decoded, stripped strings; ``raw_json``
        is the model's raw extraction text and is not parsed here.

    NOTE(review): ``spaces`` is imported in this branch but no ``@spaces.GPU``
    decorator is visible on this function — confirm the caller arranges GPU
    allocation before ``_model.to("cuda")`` runs.
    """
    _model.to("cuda")

    def _complete(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Generate a completion for ``messages`` and return only the new text."""
        encoded = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")

        # Depending on the transformers version the template call yields either
        # a BatchEncoding (5.x) or a bare tensor of input ids; handle both.
        if hasattr(encoded, "input_ids"):
            prompt_ids = encoded["input_ids"]
            model_inputs = dict(encoded)
        else:
            prompt_ids = encoded
            model_inputs = {"input_ids": encoded}

        sampling = temperature > 0
        penalised = repetition_penalty > 1.0
        with torch.no_grad():
            sequences = _model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                do_sample=sampling,
                # temperature is only meaningful when sampling; pass a neutral
                # 1.0 for greedy decoding.
                temperature=temperature if sampling else 1.0,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=4 if penalised else 0,
                pad_token_id=_tokenizer.eos_token_id,
            )

        # Drop the prompt tokens so only the newly generated text is decoded.
        prompt_len = prompt_ids.shape[1]
        completion = sequences[0][prompt_len:]
        return _tokenizer.decode(completion, skip_special_tokens=True).strip()

    # Repetition penalty (and the n-gram block it enables) is applied to the
    # reply only, to stop Hollow echoing its own previous line verbatim.
    reply = _complete(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repetition_penalty=1.3,
    )

    # The extraction pass is greedy (temperature 0) so it stays deterministic.
    extraction_messages = _build_extract_messages(user_msg, reply)
    raw_json = _complete(extraction_messages, extract_max_tokens, temperature=0.0)
    return reply, raw_json
| from transformers import TextIteratorStreamer | |
| from threading import Thread | |
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generator: stream Hollow's reply, then yield the final result tuple.

    Yields the cumulative reply string each time the streamer produces new
    text, and finally yields ``("__final__", reply, raw_json)`` where
    ``reply`` is the stripped full reply and ``raw_json`` is the raw text of a
    greedy extraction pass run once after the stream ends.

    Args:
        chat_messages: Chat-template messages for the reply generation.
        user_msg: The visitor's text for this turn (fed to the extraction pass).
        gen_max_tokens: ``max_new_tokens`` for the streamed reply.
        extract_max_tokens: ``max_new_tokens`` for the extraction output.

    Raises:
        Exception: whatever ``_model.generate`` raised in the worker thread is
            re-raised here once the stream has been drained, instead of
            leaving the consumer blocked on a streamer that never finishes.
    """
    _model.to("cuda")

    def _encode(messages):
        """Apply the chat template; return ``(prompt_ids, generate_kwargs)``."""
        tokenized = _tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            enable_thinking=False,
            return_tensors="pt",
        ).to("cuda")
        # transformers 5.x returns BatchEncoding, not a plain tensor
        if hasattr(tokenized, "input_ids"):
            return tokenized["input_ids"], dict(tokenized)
        return tokenized, {"input_ids": tokenized}

    def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
        """Blocking generation; returns only the newly generated text."""
        input_ids, generate_kwargs = _encode(messages)
        with torch.no_grad():
            out = _model.generate(
                **generate_kwargs,
                max_new_tokens=max_tokens,
                do_sample=temperature > 0,
                temperature=temperature if temperature > 0 else 1.0,
                repetition_penalty=repetition_penalty,
                no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
                pad_token_id=_tokenizer.eos_token_id,
            )
        return _tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

    _, gen_inputs = _encode(chat_messages)
    streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(**gen_inputs, max_new_tokens=gen_max_tokens, do_sample=True,
                      temperature=0.8, repetition_penalty=1.3, no_repeat_ngram_size=4,
                      pad_token_id=_tokenizer.eos_token_id, streamer=streamer)

    worker_errors = []

    def _run_generation():
        """Thread target: run generate and make sure the stream always ends."""
        try:
            _model.generate(**gen_kwargs)
        except Exception as exc:
            # generate() only signals end-of-stream on success. Without this,
            # a failure here would leave the consumer loop below waiting on
            # the streamer's queue forever.
            worker_errors.append(exc)
            streamer.end()

    thread = Thread(target=_run_generation)
    thread.start()
    reply = ""
    try:
        for piece in streamer:
            reply += piece
            yield reply
    finally:
        # Also runs when the consumer closes the generator early, so the
        # worker thread is never left un-joined.
        thread.join()

    if worker_errors:
        raise worker_errors[0]

    # Extraction runs once after the stream, greedy so it stays deterministic.
    raw_json = _generate(_build_extract_messages(user_msg, reply.strip()),
                         extract_max_tokens, temperature=0.0)
    yield ("__final__", reply.strip(), raw_json)
| else: | |
| import requests | |
# Chat endpoint of the local Ollama server, and the model tag requested from it.
_OLLAMA_URL = "http://localhost:11434/api/chat"
_MODEL = "qwen3:8b"


def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
    """Send one non-streaming chat request to Ollama and return the reply text.

    Args:
        messages: Chat messages forwarded as the request's ``messages`` field.
        max_tokens: Sent as the ``num_predict`` option.
        temperature: Sent as the ``temperature`` option.
        repeat_penalty: Sent as the ``repeat_penalty`` option (with a fixed
            ``repeat_last_n`` of 256).

    Returns:
        The stripped ``message.content`` string from the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    sampling_options = {
        "num_predict": max_tokens,
        "temperature": temperature,
        "repeat_penalty": repeat_penalty,
        "repeat_last_n": 256,
    }
    request_body = dict(
        model=_MODEL,
        messages=messages,
        stream=False,
        think=False,
        options=sampling_options,
    )
    response = requests.post(_OLLAMA_URL, json=request_body, timeout=120)
    response.raise_for_status()
    body = response.json()
    return body["message"]["content"].strip()
def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Run one non-streaming turn against the local Ollama server.

    Args:
        chat_messages: Chat messages for the reply generation.
        user_msg: The visitor's text for this turn (fed to the extraction pass).
        gen_max_tokens: Token budget for the reply.
        extract_max_tokens: Token budget for the extraction output.

    Returns:
        ``(reply, raw_json)`` — the reply text and the raw, unparsed text of
        the extraction pass.
    """
    # The repeat penalty is applied to the reply only, to stop Hollow echoing
    # its own line.
    reply = _ollama_chat(
        chat_messages,
        gen_max_tokens,
        temperature=0.8,
        repeat_penalty=1.3,
    )
    # The extraction pass uses temperature 0 and the default (neutral) penalty.
    extraction_messages = _build_extract_messages(user_msg, reply)
    raw_json = _ollama_chat(extraction_messages, extract_max_tokens, temperature=0.0)
    return reply, raw_json
def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
    """Generator: stream Hollow's reply from Ollama, then yield the final tuple.

    Yields the cumulative reply string as content chunks arrive, then yields a
    final ``("__final__", reply, raw_json)`` tuple. Extraction runs once after
    the stream (temperature 0, not streamed).

    Args:
        chat_messages: Chat messages for the reply generation.
        user_msg: The visitor's text for this turn (fed to the extraction pass).
        gen_max_tokens: Sent as ``num_predict`` for the reply.
        extract_max_tokens: Token budget for the extraction output.

    Raises:
        requests.HTTPError: if the server answers with an error status, or if
            an ``{"error": ...}`` object arrives in the middle of the stream.
    """
    payload = {
        "model": _MODEL,
        "messages": chat_messages,
        "stream": True,
        "think": False,
        "options": {"num_predict": gen_max_tokens, "temperature": 0.8,
                    "repeat_penalty": 1.3, "repeat_last_n": 256},
    }
    reply = ""
    with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
        r.raise_for_status()
        for line in r.iter_lines():
            if not line:
                continue
            chunk = json.loads(line)
            # Once streaming has begun the HTTP status can no longer change, so
            # a failure is delivered as an in-band {"error": ...} line. Surface
            # it with the same exception type raise_for_status uses, rather
            # than carrying on with a truncated reply.
            error = chunk.get("error")
            if error:
                raise requests.HTTPError(f"Ollama stream error: {error}", response=r)
            # `or {}` tolerates a chunk whose "message" field is null.
            piece = (chunk.get("message") or {}).get("content", "")
            if piece:
                reply += piece
                yield reply
            if chunk.get("done"):
                break
    raw_json = _ollama_chat(_build_extract_messages(user_msg, reply.strip()),
                            extract_max_tokens, temperature=0.0)
    yield ("__final__", reply.strip(), raw_json)