import json
import os

# True when the SPACE_ID environment variable is set to a non-empty value.
# Selects the in-process transformers backend defined under `if IS_SPACE:`
# instead of the local Ollama HTTP backend in the `else:` branch.
# NOTE(review): SPACE_ID is presumably provided by the Hugging Face Spaces
# runtime — confirm.
IS_SPACE = bool(os.environ.get("SPACE_ID"))

# System prompt for the post-reply extraction pass. It instructs the model to
# score the visitor's turn and answer with one JSON object holding five keys:
# affinity_delta, new_memories, tone_delta, cruel_quote and chosen_name.
# _build_extract_messages() sends this string verbatim as the "system" message;
# the last two lines of the prompt are worked examples of the expected output.
_EXTRACTION_SYSTEM = (
    "You are a JSON extraction assistant. Given a conversation exchange, extract five things:\n"
    "1. affinity_delta: integer from -8 to +8 — how much the visitor deepened the bond this turn. "
    "Use this scale and be GENEROUS — grief, loss, fear, and tender memories are the heart of this game and "
    "should score high (+6 to +8):\n"
    "   +7 to +8: shared something vulnerable or intimate — a fear, a loss, grief, loneliness, a tender memory.\n"
    "   +4 to +6: shared a personal fact or memory, or was warm and caring.\n"
    "   +1 to +3: ordinary politeness or small talk.\n"
    "   0: neutral or off-topic.\n"
    "   -3 to -8: cruel, mocking, dismissive, or says they are leaving.\n"
    "2. new_memories: AT MOST ONE memory — a single, self-contained personal memory the VISITOR revealed "
    "about THEMSELVES this turn, taken ONLY from what the visitor said (NEVER from Hollow's reply). Merge "
    "the turn's personal details into ONE clean sentence a person could claim in first person (e.g. "
    "[\"a dog named Pepe who was funny and who they loved\"]). Prefer one rich memory over several "
    "fragments. Skip trivia and small-talk. Empty list if nothing genuinely personal this turn.\n"
    "3. tone_delta: integer from -10 to +10 — how the visitor TREATED Hollow this turn. This is distinct "
    "from affinity: politely sharing a sad memory is high affinity but roughly 0 tone; an insult is "
    "negative on both.\n"
    "   +4 to +8: explicit warmth, kindness, comfort, affection, or protectiveness toward Hollow.\n"
    "   +1 to +3: friendly or caring phrasing directed at Hollow.\n"
    "   0: neutral, flat, matter-of-fact — sharing facts or memories WITHOUT warmth is 0, not positive.\n"
    "   -2 to -5: dismissive, cold, impatient.\n"
    "   -6 to -10: mocking, insulting, cruel, or threatening Hollow.\n"
    "4. cruel_quote: if the visitor mocked, insulted, or was cruel to Hollow this turn, the cruel phrase "
    "EXACTLY as the visitor wrote it — the visitor's words, never Hollow's. null otherwise.\n"
    "5. chosen_name: if Hollow offered a name for itself this turn, the single short name (one word, "
    "letters only, 2-12 chars) EXACTLY as Hollow wrote it. null otherwise.\n"
    "All integers must be plain JSON numbers — never write a leading + sign.\n"
    "Respond ONLY with valid JSON. No markdown. Examples:\n"
    '{"affinity_delta": 6, "new_memories": ["a grandmother named Lili who died of cancer"], "tone_delta": 2, "cruel_quote": null, "chosen_name": null}\n'
    '{"affinity_delta": -4, "new_memories": [], "tone_delta": -8, "cruel_quote": "you are a creepy little freak", "chosen_name": null}'
)


def _build_extract_messages(user_msg: str, reply: str) -> list:
    """Assemble the two-message chat used for the extraction pass.

    The system message carries the extraction instructions; the user message
    quotes the visitor's text and Hollow's reply for the turn being scored.

    Args:
        user_msg: What the visitor wrote this turn.
        reply: Hollow's reply to that message.

    Returns:
        A list of two ``{"role", "content"}`` dicts: system, then user.
    """
    transcript = f"Visitor said: {user_msg}\nHollow replied: {reply}\n\nExtract now:"
    system_turn = {"role": "system", "content": _EXTRACTION_SYSTEM}
    user_turn = {"role": "user", "content": transcript}
    return [system_turn, user_turn]


if IS_SPACE:
    import spaces
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Checkpoint used for both reply generation and the extraction pass.
    _MODEL_ID = "Qwen/Qwen3-8B"
    # Tokenizer and bfloat16 weights are loaded once, at import time. The
    # GPU-decorated entry points below call `_model.to("cuda")` on each call.
    _tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
    _model = AutoModelForCausalLM.from_pretrained(_MODEL_ID, dtype=torch.bfloat16)

    @spaces.GPU(duration=90)
    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Produce Hollow's reply for one turn, then run the extraction pass.

        Args:
            chat_messages: Chat-template message list used to generate the reply.
            user_msg: The visitor's text for this turn (quoted in the
                extraction prompt).
            gen_max_tokens: Cap on newly generated tokens for the reply.
            extract_max_tokens: Cap on newly generated tokens for the extraction.

        Returns:
            A ``(reply, raw_json)`` tuple of stripped strings; ``raw_json`` is
            the model's extraction output, returned unparsed.
        """
        _model.to("cuda")

        def _complete(messages, max_tokens, temperature, repetition_penalty=1.0):
            """Run one chat completion and return only the newly generated text."""
            encoded = _tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                enable_thinking=False,
                return_tensors="pt",
            ).to("cuda")
            # transformers 5.x hands back a BatchEncoding rather than a plain
            # tensor, so normalise both shapes into generate() keyword inputs.
            if not hasattr(encoded, "input_ids"):
                prompt_ids = encoded
                model_inputs = {"input_ids": encoded}
            else:
                prompt_ids = encoded["input_ids"]
                model_inputs = dict(encoded)

            sampling = temperature > 0
            decoding = {
                "max_new_tokens": max_tokens,
                "do_sample": sampling,
                # Greedy decoding ignores temperature; pass the neutral 1.0.
                "temperature": temperature if sampling else 1.0,
                "repetition_penalty": repetition_penalty,
                # The n-gram block is only switched on together with the penalty.
                "no_repeat_ngram_size": 4 if repetition_penalty > 1.0 else 0,
                "pad_token_id": _tokenizer.eos_token_id,
            }
            with torch.no_grad():
                sequences = _model.generate(**model_inputs, **decoding)

            # generate() returns prompt + completion; keep the completion only.
            prompt_len = prompt_ids.shape[1]
            completion = sequences[0][prompt_len:]
            return _tokenizer.decode(completion, skip_special_tokens=True).strip()

        # Only the reply gets repetition_penalty + no_repeat_ngram: it stops
        # Hollow echoing its own previous line verbatim. The extraction pass
        # runs with temperature 0 (no sampling) so it stays deterministic.
        reply = _complete(
            chat_messages,
            gen_max_tokens,
            temperature=0.8,
            repetition_penalty=1.3,
        )
        extraction_prompt = _build_extract_messages(user_msg, reply)
        raw_json = _complete(extraction_prompt, extract_max_tokens, temperature=0.0)
        return reply, raw_json

    from transformers import TextIteratorStreamer
    from threading import Thread

    @spaces.GPU(duration=90)
    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Generator: stream Hollow's reply, then emit the extraction result.

        Yields the cumulative (unstripped) reply string each time the streamer
        delivers a new piece of text, and finally a
        ``("__final__", reply, raw_json)`` tuple where ``reply`` is stripped and
        ``raw_json`` is the unparsed output of the extraction pass. Extraction
        runs once after the stream has finished and is not streamed.

        Args:
            chat_messages: Chat-template message list used to generate the reply.
            user_msg: The visitor's text for this turn (quoted in the
                extraction prompt).
            gen_max_tokens: Cap on newly generated tokens for the reply.
            extract_max_tokens: Cap on newly generated tokens for the extraction.

        Raises:
            Exception: whatever ``_model.generate`` raised in the background
                thread, re-raised here once the stream has been closed.
        """
        _model.to("cuda")

        def _generate(messages, max_tokens, temperature, repetition_penalty=1.0):
            """Run one blocking chat completion and return only the new text."""
            # transformers 5.x returns BatchEncoding, not a plain tensor
            tokenized = _tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                enable_thinking=False,
                return_tensors="pt",
            ).to("cuda")
            if hasattr(tokenized, "input_ids"):
                input_ids = tokenized["input_ids"]
                generate_kwargs = dict(tokenized)
            else:
                input_ids = tokenized
                generate_kwargs = {"input_ids": tokenized}
            with torch.no_grad():
                out = _model.generate(
                    **generate_kwargs,
                    max_new_tokens=max_tokens,
                    do_sample=temperature > 0,
                    temperature=temperature if temperature > 0 else 1.0,
                    repetition_penalty=repetition_penalty,
                    no_repeat_ngram_size=4 if repetition_penalty > 1.0 else 0,
                    pad_token_id=_tokenizer.eos_token_id,
                )
            # Drop the prompt tokens; decode the completion only.
            return _tokenizer.decode(out[0][input_ids.shape[1]:], skip_special_tokens=True).strip()

        tokenized = _tokenizer.apply_chat_template(
            chat_messages, add_generation_prompt=True, enable_thinking=False,
            return_tensors="pt").to("cuda")
        gen_inputs = dict(tokenized) if hasattr(tokenized, "input_ids") else {"input_ids": tokenized}
        streamer = TextIteratorStreamer(_tokenizer, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = dict(**gen_inputs, max_new_tokens=gen_max_tokens, do_sample=True,
                          temperature=0.8, repetition_penalty=1.3, no_repeat_ngram_size=4,
                          pad_token_id=_tokenizer.eos_token_id, streamer=streamer)

        worker_errors = []

        def _stream_worker():
            # The streamer is created without a timeout, so the consumer loop
            # below blocks until an end-of-stream signal is queued. If
            # generate() raises first, that signal never arrives and the loop
            # would hang. Record the error and close the stream by hand so the
            # failure can be re-raised from the generator itself.
            try:
                _model.generate(**gen_kwargs)
            except Exception as exc:
                worker_errors.append(exc)
                streamer.end()

        thread = Thread(target=_stream_worker)
        thread.start()
        reply = ""
        for piece in streamer:
            reply += piece
            yield reply
        thread.join()
        if worker_errors:
            # Surface the background failure instead of extracting from a
            # truncated reply.
            raise worker_errors[0]
        raw_json = _generate(_build_extract_messages(user_msg, reply.strip()),
                             extract_max_tokens, temperature=0.0)
        yield ("__final__", reply.strip(), raw_json)

else:
    import requests

    # Chat endpoint that every request in this branch is POSTed to.
    _OLLAMA_URL = "http://localhost:11434/api/chat"
    # Model tag sent as the "model" field of every request payload.
    _MODEL = "qwen3:8b"

    def _ollama_chat(messages, max_tokens, temperature, repeat_penalty=1.0):
        """POST one non-streaming chat request and return the stripped reply text.

        Args:
            messages: Chat message list forwarded as the ``messages`` field.
            max_tokens: Sent as the ``num_predict`` option.
            temperature: Sent as the ``temperature`` option.
            repeat_penalty: Sent as the ``repeat_penalty`` option.

        Raises:
            requests.RequestException: on transport failure, on the 120 s
                timeout, or (via ``raise_for_status``) on an HTTP error status.
        """
        sampling_options = {
            "num_predict": max_tokens,
            "temperature": temperature,
            "repeat_penalty": repeat_penalty,
            "repeat_last_n": 256,
        }
        body = {
            "model": _MODEL,
            "messages": messages,
            "stream": False,
            "think": False,
            "options": sampling_options,
        }
        response = requests.post(_OLLAMA_URL, json=body, timeout=120)
        response.raise_for_status()
        message = response.json()["message"]
        return message["content"].strip()

    def run_turn(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Produce Hollow's reply for one turn, then run the extraction pass.

        Returns:
            A ``(reply, raw_json)`` tuple; ``raw_json`` is the model's
            extraction output, returned unparsed.
        """
        # Only the reply carries a repeat_penalty: it stops Hollow echoing its
        # own line. The extraction request uses temperature 0 and no penalty.
        reply = _ollama_chat(
            chat_messages,
            gen_max_tokens,
            temperature=0.8,
            repeat_penalty=1.3,
        )
        extraction_prompt = _build_extract_messages(user_msg, reply)
        raw_json = _ollama_chat(extraction_prompt, extract_max_tokens, temperature=0.0)
        return reply, raw_json

    def run_turn_stream(chat_messages, user_msg, gen_max_tokens=150, extract_max_tokens=80):
        """Generator: stream Hollow's reply, then emit the extraction result.

        Yields the cumulative reply string as content chunks arrive, then
        yields a final ``("__final__", reply, raw_json)`` tuple in which
        ``reply`` is stripped. Extraction runs once after the stream
        (deterministic, not streamed).

        Args:
            chat_messages: Chat message list forwarded as the ``messages`` field.
            user_msg: The visitor's text for this turn (quoted in the
                extraction prompt).
            gen_max_tokens: Sent as ``num_predict`` for the reply.
            extract_max_tokens: Sent as ``num_predict`` for the extraction.

        Raises:
            requests.RequestException: on transport failure, timeout, an HTTP
                error status, or an ``error`` object reported mid-stream.
        """
        payload = {
            "model": _MODEL,
            "messages": chat_messages,
            "stream": True,
            "think": False,
            "options": {"num_predict": gen_max_tokens, "temperature": 0.8,
                        "repeat_penalty": 1.3, "repeat_last_n": 256},
        }
        reply = ""
        with requests.post(_OLLAMA_URL, json=payload, stream=True, timeout=120) as r:
            r.raise_for_status()
            # The body is newline-delimited JSON: one object per line.
            for line in r.iter_lines():
                if not line:
                    continue
                chunk = json.loads(line)
                # Once streaming has started the HTTP status can no longer
                # change, so a failure is reported as an {"error": ...} line.
                # Raise rather than extract from a truncated or empty reply.
                if "error" in chunk:
                    raise requests.RequestException(
                        f"Ollama stream error: {chunk['error']}", response=r
                    )
                piece = chunk.get("message", {}).get("content", "")
                if piece:
                    reply += piece
                    yield reply
                if chunk.get("done"):
                    break
        raw_json = _ollama_chat(_build_extract_messages(user_msg, reply.strip()),
                                extract_max_tokens, temperature=0.0)
        yield ("__final__", reply.strip(), raw_json)