File size: 10,045 Bytes
296d984
d8d8506
 
 
 
 
e05a544
8c5bc54
b4f9582
 
328f82b
 
 
8c5bc54
 
b4f9582
 
 
 
 
ee51d7a
 
 
328f82b
 
 
ee51d7a
 
 
 
e05a544
 
ef89c06
 
e05a544
 
d8d8506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf33c28
d8d8506
 
b4f9582
d8d8506
 
240d8c8
cf33c28
 
d8d8506
 
 
 
 
cf33c28
 
 
 
 
 
d8d8506
 
cf33c28
d8d8506
 
 
240d8c8
 
d8d8506
 
cf33c28
d8d8506
240d8c8
 
 
 
d8d8506
 
 
296d984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8d8506
 
 
 
 
 
240d8c8
d8d8506
 
 
 
 
240d8c8
 
d8d8506
 
 
 
 
b4f9582
240d8c8
 
 
d8d8506
 
296d984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
import json
import os

# True when the SPACE_ID environment variable is set to a non-empty string.
# Selects the in-process transformers backend below; otherwise the local
# Ollama HTTP backend is used.
IS_SPACE = os.environ.get("SPACE_ID", "") != ""

# System prompt for the extraction pass. It asks the model to score one
# visitor/Hollow exchange and answer with a single JSON object holding
# affinity_delta, new_memories, tone_delta, cruel_quote and chosen_name.
# The dashes are real em dashes (U+2014); keep this file UTF-8 so they are not
# mangled into multi-character garbage inside the prompt.
_EXTRACTION_SYSTEM = (
    "You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
    "1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
    "Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
    "should score high (+6 to +8):\n"
    "   +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
    "   +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
    "   +1 to +3: ordinary politeness or small talk.\n"
    "   0: neutral or off-topic.\n"
    "   -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
    "2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
    "about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
    "the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
    "[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
    "fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
    "3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
    "from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
    "negative on both.\n"
    "   +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
    "   +1 to +3: friendly or caring phrasing directed at Hollow.\n"
    "   0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
    "   -2 to -5: dismissive, cold, impatient.\n"
    "   -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
    "4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
    "EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
    "5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
    "letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
    "All integers must be plain JSON numbers — never write a leading + sign.\n"
    "Respond ONLY with valid JSON. No markdown. Examples:\n"
    '{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
    '{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
)


def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Build the two-message chat (system + user) for the extraction pass.

    The system turn carries ``_EXTRACTION_SYSTEM``; the user turn quotes the
    visitor's message and Hollow's reply and ends with the extraction cue.
    """
    exchange = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
    system_turn = {"role": "system", "content": _EXTRACTION_SYSTEM}
    user_turn = {"role": "user", "content": exchange}
    return [system_turn, user_turn]


if IS_SPACE:
    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Model identifier passed to both from_pretrained() calls below.
    _MODEL_ID = "Qwen/Qwen3-8B"
    # Tokenizer and model are loaded once, at import time, and only when
    # IS_SPACE is true.
    _tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
    # Weights are requested in bfloat16. The model is not moved to CUDA here;
    # the GPU-decorated functions call _model.to("cuda") themselves.
    _model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)

    @spaces.GPU(duration=90)
    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Run one full turn: generate Hollow's reply, then the extraction JSON.

        Args:
            chat_messages: chat-format message list fed to the chat template
                for the reply.
            user_msg: the visitor's message for this turn, quoted in the
                extraction prompt.
            gen_max_tokens: ``max_new_tokens`` for the reply.
            extract_max_tokens: ``max_new_tokens`` for the extraction pass.

        Returns:
            ``(reply, raw_json)``: the stripped reply text and the stripped,
            unparsed extraction output.
        """
        _model.to("cuda")

        def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
            """Render ``messages`` with the chat template and decode only the new tokens."""
            encoded = _tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                enable_thinking=False,
                return_tensors="pt",
            ).to("cuda")
            # transformers 5.x returns a BatchEncoding rather than a plain
            # tensor, so both shapes are accepted here.
            is_batch_encoding = hasattr(encoded, "input_ids")
            prompt_ids = encoded["input_ids"] if is_batch_encoding else encoded
            model_inputs = dict(encoded) if is_batch_encoding else {"input_ids": encoded}

            sampling = temperature > 0
            penalised = repetition_penalty > 1.0
            with torch.no_grad():
                sequences = _model.generate(
                    **model_inputs,
                    max_new_tokens=max_tokens,
                    do_sample=sampling,
                    # Greedy decoding ignores temperature; 1.0 is passed as a
                    # neutral value when temperature is 0.
                    temperature=temperature if sampling else 1.0,
                    repetition_penalty=repetition_penalty,
                    no_repeat_ngram_size=4 if penalised else 0,
                    pad_token_id=_tokenizer.eos_token_id,
                )
            # Drop the prompt tokens so only the completion is decoded.
            completion = sequences[0][prompt_ids.shape[1]:]
            return _tokenizer.decode(completion, skip_special_tokens=True).strip()

        # Repetition penalty and the n-gram block apply to the reply only, to
        # stop Hollow echoing its own previous line verbatim. The extraction
        # pass runs greedy (temperature 0) with no penalty.
        reply = _generate(
            chat_messages, gen_max_tokens, temperature=0.8, repetition_penalty=1.3
        )
        extract_messages = _build_extract_messages(user_msg, reply)
        raw_json = _generate(extract_messages, extract_max_tokens, temperature=0.0)
        return reply, raw_json

    from transformers import TextIteratorStreamer
    from threading import Thread

    @spaces.GPU(duration=90)
    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Stream Hollow's reply, then run the extraction pass once.

        Generator. Yields the cumulative reply string each time the streamer
        produces more text, then yields a final
        ``("__final__", reply, raw_json)`` tuple where ``reply`` is stripped and
        ``raw_json`` is the unparsed extraction output.

        Args:
            chat_messages: chat-format message list fed to the chat template
                for the reply.
            user_msg: the visitor's message for this turn, quoted in the
                extraction prompt.
            gen_max_tokens: ``max_new_tokens`` for the streamed reply.
            extract_max_tokens: ``max_new_tokens`` for the extraction pass.

        Raises:
            Exception: any error raised by ``_model.generate`` in the worker
                thread is re-raised here after the stream has drained, instead
                of leaving the consumer loop blocked.
        """
        _model.to("cuda")

        def _encode(messages):
            """Apply the chat template; return ``(generate_inputs, prompt_length)``."""
            # transformers 5.x returns BatchEncoding, not a plain tensor
            tokenized = _tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                enable_thinking=False,
                return_tensors="pt",
            ).to("cuda")
            if hasattr(tokenized, "input_ids"):
                return dict(tokenized), tokenized["input_ids"].shape[1]
            return {"input_ids": tokenized}, tokenized.shape[1]

        def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
            """Blocking generation; returns only the newly generated text, stripped."""
            generate_kwargs, prompt_len = _encode(messages)
            with torch.no_grad():
                out = _model.generate(
                    **generate_kwargs,
                    max_new_tokens=max_tokens,
                    do_sample=temperature > 0,
                    temperature=temperature if temperature > 0 else 1.0,
                    repetition_penalty=repetition_penalty,
                    no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
                    pad_token_id=_tokenizer.eos_token_id,
                )
            return _tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

        gen_inputs, _ = _encode(chat_messages)
        streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(**gen_inputs, max_new_tokens=gen_max_tokens, do_sample=True,
                          temperature=0.8, repetition_penalty=1.3, no_repeat_ngram_size=4,
                          pad_token_id=_tokenizer.eos_token_id, streamer=streamer)

        worker_errors = []

        def _worker():
            try:
                _model.generate(**gen_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                # generate() did not finish normally, so the consumer loop
                # below may still be waiting for the streamer's stop signal.
                # End the stream here so it cannot block forever.
                streamer.end()

        thread = Thread(target=_worker)
        thread.start()
        reply = ""
        for piece in streamer:
            reply += piece
            yield reply
        thread.join()
        if worker_errors:
            # Surface the generation failure rather than extracting from a
            # truncated or empty reply.
            raise worker_errors[0]
        # Extraction runs once after the stream, greedy and not streamed.
        raw_json = _generate(_build_extract_messages(user_msg, reply.strip()),
                             extract_max_tokens, temperature=0.0)
        yield ("__final__", reply.strip(), raw_json)

else:
    import requests

    # Chat endpoint every request in this branch is posted to (localhost, port 11434).
    _OLLAMA_URL = "http://localhost:11434/api/chat"
    # Model tag sent as the "model" field of every request payload.
    _MODEL = "qwen3:8b"

    def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
        """Send one non-streaming chat request and return the reply text.

        Args:
            messages: chat-format message list sent as ``"messages"``.
            max_tokens: sent as the ``num_predict`` option.
            temperature: sent as the ``temperature`` option.
            repeat_penalty: sent as the ``repeat_penalty`` option, together
                with a fixed ``repeat_last_n`` of 256.

        Returns:
            The ``message.content`` field of the JSON response, stripped.

        Raises:
            requests.exceptions.HTTPError: via ``raise_for_status()`` on an
                error status code.
        """
        sampling_options = {
            "num_predict": max_tokens,
            "temperature": temperature,
            "repeat_penalty": repeat_penalty,
            "repeat_last_n": 256,
        }
        body = {
            "model": _MODEL,
            "messages": messages,
            "stream": False,
            "think": False,
            "options": sampling_options,
        }
        response = requests.post(_OLLAMA_URL, json=body, timeout=120)
        response.raise_for_status()
        content = response.json()["message"]["content"]
        return content.strip()

    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Run one full turn: generate Hollow's reply, then the extraction JSON.

        Returns:
            ``(reply, raw_json)``: the reply text and the unparsed extraction
            output, both as returned by ``_ollama_chat``.
        """
        # Only the reply gets a repeat penalty, to stop Hollow echoing its own
        # line; the extraction call uses temperature 0 and the default penalty.
        reply = _ollama_chat(
            chat_messages, gen_max_tokens, temperature=0.8, repeat_penalty=1.3
        )
        extract_messages = _build_extract_messages(user_msg, reply)
        raw_json = _ollama_chat(extract_messages, extract_max_tokens, temperature=0.0)
        return reply, raw_json

    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Stream Hollow's reply, then run the extraction pass once.

        Generator: yields the cumulative reply string as tokens arrive, then
        yields a final ``("__final__", reply, raw_json)`` tuple. Extraction runs
        once after the stream (temperature 0, not streamed).

        Args:
            chat_messages: chat-format message list sent as ``"messages"``.
            user_msg: the visitor's message for this turn, quoted in the
                extraction prompt.
            gen_max_tokens: ``num_predict`` for the streamed reply.
            extract_max_tokens: ``num_predict`` for the extraction pass.

        Raises:
            requests.exceptions.HTTPError: via ``raise_for_status()`` on an
                error status code.
            requests.exceptions.RequestException: when the server sends an
                ``{"error": ...}`` object in the middle of the stream.
        """
        payload = {
            "model": _MODEL,
            "messages": chat_messages,
            "stream": True,
            "think": False,
            "options": {"num_predict": gen_max_tokens, "temperature": 0.8,
                        "repeat_penalty": 1.3, "repeat_last_n": 256},
        }
        reply = ""
        with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
            r.raise_for_status()
            for line in r.iter_lines():
                if not line:
                    # Skip blank lines in the stream.
                    continue
                chunk = json.loads(line)
                # Once streaming has started the HTTP status can no longer
                # change, so a failure arrives as an NDJSON object with an
                # "error" field. Raise instead of treating it as an empty
                # chunk and extracting from a truncated reply.
                error = chunk.get("error")
                if error:
                    raise requests.exceptions.RequestException(
                        f"Ollama stream error: {error}", response=r
                    )
                piece = chunk.get("message", {}).get("content", "")
                if piece:
                    reply += piece
                    yield reply
                if chunk.get("done"):
                    break
        raw_json = _ollama_chat(_build_extract_messages(user_msg, reply.strip()),
                                extract_max_tokens, temperature=0.0)
        yield ("__final__", reply.strip(), raw_json)