""" Prompt Optimizer โ€” reduces token usage in prompts while preserving semantic content. V1 strategy: - Whitespace normalization - Redundant preamble removal - Instruction conciseness rewriting - Conversation history summarization (stub) - System prompt selection V2: LLMLingua semantic compression (if llmlingua installed) Falls back to V1 heuristics if not available. """ from __future__ import annotations import logging import re from dataclasses import dataclass from typing import Optional logger = logging.getLogger(__name__) @dataclass class OptimizedPrompt: original_query: str optimized_query: str system_prompt: str original_tokens: int optimized_tokens: int tokens_saved: int compression_ratio: float techniques_applied: list[str] def to_dict(self) -> dict: return self.__dict__.copy() # --------------------------------------------------------------------------- # System prompt templates # --------------------------------------------------------------------------- _SYSTEM_PROMPTS = { "verbose": ( "You are a helpful, accurate, and thorough assistant. " "Provide complete, well-structured answers. " "Think step by step when solving complex problems." ), "concise": ( "You are a helpful assistant. Be clear and accurate. " "Avoid unnecessary verbosity." ), "minimal": "Answer concisely and accurately.", } # Patterns that add no semantic value _FILLER_PATTERNS = [ r"(?i)^(please\s+)?can\s+you\s+(please\s+)?", r"(?i)^i\s+(would\s+like|want|need)\s+(you\s+to\s+)?", r"(?i)^could\s+you\s+(please\s+)?", r"(?i)^hey\s+(there\s+)?(claude|assistant|ai|chatgpt)[\s,!]*", r"(?i)^(hi|hello|hey)[,!.\s]+", r"(?i)\s+(please|thank\s+you|thanks)[.!]?\s*$", r"(?i)^(i\s+)?(was\s+)?wondering\s+(if\s+)?(you\s+)?(could|can)\s+", ] # Verbose instruction phrases โ†’ concise alternatives _INSTRUCTION_REWRITES = [ (r"(?i)provide\s+a\s+detailed\s+explanation\s+of", "explain"), (r"(?i)give\s+me\s+a\s+comprehensive\s+overview\s+of", "overview:"), (r"(?i)i\s+need\s+you\s+to\s+write\s+a\s+", "write a "), (r"(?i)could\s+you\s+please\s+explain", "explain"), (r"(?i)what\s+is\s+the\s+best\s+way\s+to", "best way to"), (r"(?i)in\s+simple\s+terms,?\s+", ""), (r"(?i)for\s+a\s+(complete\s+)?beginner[\s,]+", ""), ] class PromptOptimizer: """ Optimizes prompts to minimize token usage. V2: Uses LLMLingua for semantic compression when installed. Falls back to V1 heuristic compression (filler removal, rewrites) if not. """ def __init__(self): self._llmlingua = None try: from llmlingua import PromptCompressor # type: ignore logger.info("PromptOptimizer: Loading LLMLingua compressor...") self._llmlingua = PromptCompressor( model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank", use_llmlingua2=True, device_map="cpu", ) logger.info("PromptOptimizer: LLMLingua ready!") except ImportError: logger.info("PromptOptimizer: llmlingua not installed. Using V1 heuristic compression.") except Exception as e: logger.warning(f"PromptOptimizer: Failed to load LLMLingua: {e}. Using V1 heuristics.") def optimize( self, query: str, system_prompt_style: str = "concise", compression_enabled: bool = True, conversation_history: Optional[list[dict]] = None, ) -> OptimizedPrompt: """ Returns an OptimizedPrompt with the reduced query and chosen system prompt. """ techniques: list[str] = [] optimized = query if compression_enabled: optimized, applied = self._compress(optimized) techniques.extend(applied) system_prompt = _SYSTEM_PROMPTS.get(system_prompt_style, _SYSTEM_PROMPTS["concise"]) # Summarize history if provided (stub โ€” V2 will use LLM summarization) if conversation_history: system_prompt += self._summarize_history(conversation_history) techniques.append("history_summarization_stub") orig_tokens = self._count_tokens(query) opt_tokens = self._count_tokens(optimized) saved = max(0, orig_tokens - opt_tokens) return OptimizedPrompt( original_query=query, optimized_query=optimized, system_prompt=system_prompt, original_tokens=orig_tokens, optimized_tokens=opt_tokens, tokens_saved=saved, compression_ratio=round(saved / max(orig_tokens, 1), 3), techniques_applied=techniques, ) # ------------------------------------------------------------------ # Compression pipeline # ------------------------------------------------------------------ def _compress(self, text: str) -> tuple[str, list[str]]: techniques = [] # --- V2: LLMLingua Semantic Compression --- if self._llmlingua and len(text.split()) > 15: try: result = self._llmlingua.compress_prompt( [text], rate=0.6, # Keep 60% of tokens force_tokens=["?"], # Always keep question marks ) compressed = result["compressed_prompt"].strip() # Only use if it actually saved tokens and isn't empty if compressed and len(compressed.split()) < len(text.split()): techniques.append("llmlingua_semantic_compression") return compressed, techniques except Exception as e: logger.warning(f"LLMLingua compression failed: {e}. Falling back to V1.") # --- V1: Heuristic Compression --- original = text # 1. Whitespace normalization cleaned = re.sub(r"\s+", " ", text).strip() if cleaned != text: techniques.append("whitespace_normalization") # 2. Remove filler preambles for pattern in _FILLER_PATTERNS: new = re.sub(pattern, "", cleaned).strip() if new != cleaned: cleaned = new if "filler_removal" not in techniques: techniques.append("filler_removal") # Capitalize first letter after removal if cleaned and cleaned[0].islower(): cleaned = cleaned[0].upper() + cleaned[1:] # 3. Instruction conciseness rewrites for pattern, replacement in _INSTRUCTION_REWRITES: new = re.sub(pattern, replacement, cleaned) if new != cleaned: cleaned = new if "instruction_rewrite" not in techniques: techniques.append("instruction_rewrite") # 4. Deduplicate consecutive whitespace again after rewrites cleaned = re.sub(r"\s+", " ", cleaned).strip() return cleaned, techniques # ------------------------------------------------------------------ # History summarization (stub) # ------------------------------------------------------------------ def _summarize_history(self, history: list[dict]) -> str: """ V1: truncate to last 3 turns. V2: call a cheap LLM to produce a compressed memory string. """ if len(history) <= 3: context = "\n".join( f"{m['role'].upper()}: {m['content'][:200]}" for m in history ) else: recent = history[-3:] context = f"[{len(history) - 3} earlier turns omitted]\n" + "\n".join( f"{m['role'].upper()}: {m['content'][:200]}" for m in recent ) return f"\n\nConversation context:\n{context}" # ------------------------------------------------------------------ # Token counting # ------------------------------------------------------------------ @staticmethod def _count_tokens(text: str) -> int: """ Approximate token count. Uses tiktoken if available and network accessible, otherwise falls back to word-based estimate (รท 0.75). The fallback is accurate to within ~10% for English text. """ try: import tiktoken # type: ignore enc = tiktoken.get_encoding("cl100k_base") return len(enc.encode(text)) except Exception: # Fallback: GPT tokenizers average ~0.75 words per token return int(len(text.split()) / 0.75)