import torch

import config
from core.tokenizer_utils import count_tokens
from models.model_loader import get_llm

try:
    import spaces
    _gpu = spaces.GPU
except ImportError:
    def _gpu(fn):
        """Return *fn* unchanged (no-op when the spaces package is absent)."""
        return fn


# The pieces below are joined into one prompt with single-space separators;
# `{target}` and `{text}` are filled in by `str.format` inside `compress`.
# NOTE(review): the separators are reproduced as found -- confirm whether
# line breaks between the instructions / TEXT / COMPRESSED were intended.
_PROMPT_TEMPLATE = (
    "You are a lossless compression assistant. "
    "Compress the following text to at most {target} tokens. "
    "Preserve all key facts, decisions, and intent. "
    "Do not add commentary. "
    "Output only the compressed text. "
    "TEXT: {text} "
    "COMPRESSED:"
)


@_gpu
def _generate(prompt: str) -> str:
    """Generate a completion for *prompt* with the shared LLM.

    The model is moved to CUDA when available (CPU otherwise), generation
    runs with sampling disabled and at most ``config.MAX_NEW_TOKENS`` new
    tokens, and only the newly generated tokens are decoded.

    Args:
        prompt: Full prompt text passed to the tokenizer.

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    model, tokenizer = get_llm()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=config.MAX_NEW_TOKENS,
            do_sample=False,
            # Reuse EOS as the padding id so generate() has one to pad with.
            pad_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt ids; drop them so only the
    # continuation is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def compress(text: str, target_tokens: int) -> tuple[str, int, int]:
    """Compress *text* to at most *target_tokens* tokens using the LLM.

    Text that already fits within the budget (as measured by
    ``count_tokens``) is returned unchanged without calling the model.

    Args:
        text: The text to compress.
        target_tokens: Maximum number of tokens for the result; must be
            non-negative.

    Returns:
        A ``(compressed_text, input_token_count, output_token_count)`` tuple.

    Raises:
        ValueError: If *target_tokens* is negative.
    """
    # A negative budget would turn the trim slice below into "drop the last
    # N tokens" instead of enforcing a limit, so reject it up front.
    if target_tokens < 0:
        raise ValueError(
            f"target_tokens must be non-negative, got {target_tokens}"
        )

    input_tokens = count_tokens(text)
    if input_tokens <= target_tokens:
        return text, input_tokens, input_tokens

    prompt = _PROMPT_TEMPLATE.format(target=target_tokens, text=text)
    compressed = _generate(prompt)

    # Trim to the hard token limit if the model overshoots.
    _, tokenizer = get_llm()
    ids = tokenizer.encode(compressed, add_special_tokens=False)
    if len(ids) > target_tokens:
        compressed = tokenizer.decode(
            ids[:target_tokens], skip_special_tokens=True
        )

    output_tokens = count_tokens(compressed)
    return compressed, input_tokens, output_tokens