import torch
import config
from core.tokenizer_utils import count_tokens
from models.model_loader import get_llm

# `spaces` is an optional dependency: when it can be imported, its `GPU`
# decorator is used; otherwise `_gpu` degrades to an identity decorator.
try:
    import spaces
    _gpu = spaces.GPU
except ImportError:
    def _gpu(fn):
        """Return *fn* unchanged (no-op when running locally without the spaces package)."""
        return fn


# Instruction prompt for the compression model. It is a `str.format` template
# with two replacement fields: `{target}` (the token budget) and `{text}` (the
# text to compress). Any literal braces added to this template later must be
# doubled (`{{` / `}}`) so that `str.format` does not treat them as fields.
_PROMPT_TEMPLATE = """You are a lossless compression assistant. Compress the following text to at most {target} tokens.
Preserve all key facts, decisions, and intent. Do not add commentary. Output only the compressed text.

TEXT:
{text}

COMPRESSED:"""


@_gpu
def _generate(prompt: str) -> str:
    """Run the LLM on *prompt* and return only the newly generated text.

    The model and tokenizer come from ``get_llm()``. The model is moved to
    CUDA when ``torch.cuda.is_available()`` and to CPU otherwise. Generation
    runs with sampling disabled and is capped at ``config.MAX_NEW_TOKENS``
    new tokens.

    Args:
        prompt: The complete prompt string to feed to the model.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    llm, tok = get_llm()

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    llm.to(target_device)

    encoded = tok(prompt, return_tensors="pt").to(target_device)

    generation_kwargs = {
        "max_new_tokens": config.MAX_NEW_TOKENS,
        "do_sample": False,
        # The EOS id doubles as the padding id for generation.
        "pad_token_id": tok.eos_token_id,
    }
    with torch.no_grad():
        generated = llm.generate(**encoded, **generation_kwargs)

    # Drop the leading prompt-length ids so only the continuation is decoded.
    # NOTE(review): assumes the generate output starts with the prompt ids
    # (decoder-only style model) — confirm against the model from get_llm().
    prompt_length = encoded["input_ids"].shape[1]
    continuation_ids = generated[0][prompt_length:]
    decoded = tok.decode(continuation_ids, skip_special_tokens=True)
    return decoded.strip()


def compress(text: str, target_tokens: int) -> tuple[str, int, int]:
    """Compress *text* with the LLM so that it fits within *target_tokens*.

    Text whose ``count_tokens`` size is already within the budget is returned
    unchanged without calling the model. Otherwise the model is prompted with
    ``_PROMPT_TEMPLATE`` and, if its answer still exceeds the budget, the
    answer is cut to the first ``target_tokens`` ids of the LLM tokenizer.

    Args:
        text: The text to compress.
        target_tokens: Maximum number of tokens allowed in the result. Must
            not be negative.

    Returns:
        ``(compressed_text, input_token_count, output_token_count)``. Both
        counts are computed with ``count_tokens``.

    Raises:
        ValueError: If ``target_tokens`` is negative.
    """
    if target_tokens < 0:
        # A negative budget would turn the ``ids[:target_tokens]`` slice below
        # into "drop the last N tokens", silently returning a wrongly trimmed
        # result after a wasted model call. Reject it up front instead.
        raise ValueError(
            f"target_tokens must be non-negative, got {target_tokens}"
        )

    input_tokens = count_tokens(text)

    # Already within budget: nothing to do, and no model call needed.
    if input_tokens <= target_tokens:
        return text, input_tokens, input_tokens

    prompt = _PROMPT_TEMPLATE.format(target=target_tokens, text=text)
    compressed = _generate(prompt)

    # Trim to hard token limit if model overshoots
    _, tokenizer = get_llm()
    ids = tokenizer.encode(compressed, add_special_tokens=False)
    if len(ids) > target_tokens:
        compressed = tokenizer.decode(ids[:target_tokens], skip_special_tokens=True)

    # NOTE(review): the trim above uses the LLM tokenizer while the reported
    # count uses count_tokens; if those tokenize differently the reported
    # count may not equal target_tokens exactly — confirm.
    output_tokens = count_tokens(compressed)
    return compressed, input_tokens, output_tokens