Spaces:
Sleeping
Sleeping
| """ | |
| Prompt Optimizer — reduces token usage in prompts while preserving | |
| semantic content. | |
| V1 strategy: | |
| - Whitespace normalization | |
| - Redundant preamble removal | |
| - Instruction conciseness rewriting | |
| - Conversation history summarization (stub) | |
| - System prompt selection | |
| V2: LLMLingua semantic compression (if llmlingua installed) | |
| Falls back to V1 heuristics if not available. | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import re | |
| from dataclasses import dataclass | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
@dataclass
class OptimizedPrompt:
    """
    Result of one prompt-optimization pass.

    The ``@dataclass`` decorator generates the keyword-argument constructor
    that ``PromptOptimizer.optimize`` relies on; without it the class has
    annotations only and cannot be instantiated with field values.
    """

    original_query: str  # query exactly as received
    optimized_query: str  # query after compression (equal to original if disabled)
    system_prompt: str  # selected system prompt, plus any conversation context
    original_tokens: int  # token count of original_query
    optimized_tokens: int  # token count of optimized_query
    tokens_saved: int  # original_tokens - optimized_tokens, floored at 0 by the producer
    compression_ratio: float  # tokens_saved / original_tokens, rounded by the producer
    techniques_applied: list[str]  # names of the techniques that changed the prompt

    def to_dict(self) -> dict:
        """
        Return the fields as a plain dict.

        The ``techniques_applied`` list is copied so that mutating the
        returned dict cannot alter this instance.
        """
        payload = dict(vars(self))
        payload["techniques_applied"] = list(self.techniques_applied)
        return payload
# ---------------------------------------------------------------------------
# System prompt templates
# ---------------------------------------------------------------------------
# Keyed by style name, ordered from most to least wordy. Callers look a style
# up by name and choose their own fallback for unknown names.
_SYSTEM_PROMPTS = dict(
    verbose=(
        "You are a helpful, accurate, and thorough assistant. "
        "Provide complete, well-structured answers. "
        "Think step by step when solving complex problems."
    ),
    concise=(
        "You are a helpful assistant. Be clear and accurate. "
        "Avoid unnecessary verbosity."
    ),
    minimal="Answer concisely and accurately.",
)
# Patterns that add no semantic value; each match is replaced with "".
#
# Order matters when the list is applied in a single pass: greetings come
# first so that a request preamble hidden behind one ("Hi, can you ...") sits
# at the start of the string by the time its ^-anchored pattern runs.
_FILLER_PATTERNS = [
    # Greeting addressed to the assistant by name. The \b keeps a short name
    # such as "ai" from matching the beginning of a longer word ("airflow").
    r"(?i)^hey\s+(there\s+)?(claude|assistant|ai|chatgpt)\b[\s,!]*",
    # Bare greeting. "there" is consumed only when punctuation follows, so
    # "Hi there, ..." is stripped whole while "Hello there are ..." keeps "there".
    r"(?i)^(hi|hello|hey)(\s+there(?=[,!.]))?[,!.\s]+",
    # Request preambles.
    r"(?i)^(i\s+)?(was\s+)?wondering\s+(if\s+)?(you\s+)?(could|can)\s+",
    r"(?i)^(please\s+)?can\s+you\s+(please\s+)?",
    r"(?i)^could\s+you\s+(please\s+)?",
    r"(?i)^i\s+(would\s+like|want|need)\s+(you\s+to\s+)?",
    # Trailing politeness.
    r"(?i)\s+(please|thank\s+you|thanks)[.!]?\s*$",
]
# Verbose instruction phrases → concise alternatives, as (pattern, replacement).
#
# Every pattern starts with \b because these are searched anywhere in the
# text: without the boundary, "in simple terms" also matches the tail of
# "Explain simple terms ..." and deletes part of the word "Explain".
_INSTRUCTION_REWRITES = [
    (r"(?i)\bprovide\s+a\s+detailed\s+explanation\s+of", "explain"),
    (r"(?i)\bgive\s+me\s+a\s+comprehensive\s+overview\s+of", "overview:"),
    (r"(?i)\bi\s+need\s+you\s+to\s+write\s+a\s+", "write a "),
    (r"(?i)\bcould\s+you\s+please\s+explain", "explain"),
    (r"(?i)\bwhat\s+is\s+the\s+best\s+way\s+to", "best way to"),
    (r"(?i)\bin\s+simple\s+terms,?\s+", ""),
    (r"(?i)\bfor\s+a\s+(complete\s+)?beginner[\s,]+", ""),
]
class PromptOptimizer:
    """
    Optimizes prompts to minimize token usage.

    V2: Uses LLMLingua for semantic compression when installed.
    Falls back to V1 heuristic compression (filler removal, rewrites) if not.
    """

    def __init__(self):
        """
        Try to load the LLMLingua compressor.

        ``self._llmlingua`` stays ``None`` when the package is missing or the
        model fails to load; the optimizer then uses the V1 heuristics only.
        """
        self._llmlingua = None
        try:
            from llmlingua import PromptCompressor  # type: ignore

            logger.info("PromptOptimizer: Loading LLMLingua compressor...")
            self._llmlingua = PromptCompressor(
                model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
                use_llmlingua2=True,
                device_map="cpu",
            )
            logger.info("PromptOptimizer: LLMLingua ready!")
        except ImportError:
            logger.info("PromptOptimizer: llmlingua not installed. Using V1 heuristic compression.")
        except Exception as e:
            # Deliberately broad: a failed model load must not prevent the
            # optimizer from being constructed.
            logger.warning("PromptOptimizer: Failed to load LLMLingua: %s. Using V1 heuristics.", e)

    def optimize(
        self,
        query: str,
        system_prompt_style: str = "concise",
        compression_enabled: bool = True,
        conversation_history: Optional[list[dict]] = None,
    ) -> "OptimizedPrompt":
        """
        Returns an OptimizedPrompt with the reduced query and chosen system prompt.

        Args:
            query: The user prompt to optimize.
            system_prompt_style: Key into ``_SYSTEM_PROMPTS``; unknown keys
                fall back to the "concise" template.
            compression_enabled: When False the query is passed through unchanged.
            conversation_history: Optional list of message dicts; when
                non-empty, a truncated context block is appended to the
                system prompt.

        Token statistics cover the query only, not the system prompt.
        """
        techniques: list[str] = []
        optimized = query
        if compression_enabled:
            optimized, applied = self._compress(optimized)
            techniques.extend(applied)

        system_prompt = _SYSTEM_PROMPTS.get(system_prompt_style, _SYSTEM_PROMPTS["concise"])

        # Summarize history if provided (stub — V2 will use LLM summarization)
        if conversation_history:
            system_prompt += self._summarize_history(conversation_history)
            techniques.append("history_summarization_stub")

        orig_tokens = self._count_tokens(query)
        opt_tokens = self._count_tokens(optimized)
        # Floor at zero: a "compressed" prompt may tokenize to more tokens.
        saved = max(0, orig_tokens - opt_tokens)
        return OptimizedPrompt(
            original_query=query,
            optimized_query=optimized,
            system_prompt=system_prompt,
            original_tokens=orig_tokens,
            optimized_tokens=opt_tokens,
            tokens_saved=saved,
            # max(..., 1) avoids dividing by zero for an empty query.
            compression_ratio=round(saved / max(orig_tokens, 1), 3),
            techniques_applied=techniques,
        )

    # ------------------------------------------------------------------
    # Compression pipeline
    # ------------------------------------------------------------------
    def _compress(self, text: str) -> tuple[str, list[str]]:
        """
        Compress ``text`` and report which techniques changed it.

        Tries LLMLingua first (only when loaded and the text has more than
        15 whitespace-separated words); otherwise, or when that yields no
        reduction or fails, applies the V1 heuristics.

        Returns:
            ``(compressed_text, technique_names)``.
        """
        techniques: list[str] = []

        # --- V2: LLMLingua Semantic Compression ---
        if self._llmlingua and len(text.split()) > 15:
            try:
                result = self._llmlingua.compress_prompt(
                    [text],
                    rate=0.6,  # Keep 60% of tokens
                    force_tokens=["?"],  # Always keep question marks
                )
                compressed = result["compressed_prompt"].strip()
                # Only use if it actually saved tokens and isn't empty
                if compressed and len(compressed.split()) < len(text.split()):
                    techniques.append("llmlingua_semantic_compression")
                    return compressed, techniques
            except Exception as e:
                logger.warning("LLMLingua compression failed: %s. Falling back to V1.", e)

        # --- V1: Heuristic Compression ---
        # 1. Whitespace normalization
        cleaned = re.sub(r"\s+", " ", text).strip()
        if cleaned != text:
            techniques.append("whitespace_normalization")

        # 2. Remove filler preambles. Passes repeat until nothing changes so
        #    that stacked fillers ("Hi, can you please ...") are all removed
        #    regardless of pattern order. Each productive pass only deletes
        #    characters, so the loop terminates.
        before_fillers = cleaned
        while True:
            pass_start = cleaned
            for pattern in _FILLER_PATTERNS:
                cleaned = re.sub(pattern, "", cleaned).strip()
            if cleaned == pass_start:
                break
        if not cleaned:
            # The whole query was filler (e.g. "Hello!"); keep it rather than
            # emitting an empty prompt.
            cleaned = before_fillers
        elif cleaned != before_fillers:
            techniques.append("filler_removal")

        # 3. Instruction conciseness rewrites
        before_rewrites = cleaned
        for pattern, replacement in _INSTRUCTION_REWRITES:
            cleaned = re.sub(pattern, replacement, cleaned)
        if cleaned != before_rewrites:
            techniques.append("instruction_rewrite")

        # 4. Deduplicate consecutive whitespace again after rewrites
        cleaned = re.sub(r"\s+", " ", cleaned).strip()

        # 5. Re-capitalize only when a filler or rewrite changed the text;
        #    done last because rewrites can leave a lowercase first word.
        text_was_rewritten = (
            "filler_removal" in techniques or "instruction_rewrite" in techniques
        )
        if text_was_rewritten and cleaned[:1].islower():
            cleaned = cleaned[0].upper() + cleaned[1:]

        return cleaned, techniques

    # ------------------------------------------------------------------
    # History summarization (stub)
    # ------------------------------------------------------------------
    def _summarize_history(self, history: list[dict]) -> str:
        """
        V1: truncate to last 3 turns.
        V2: call a cheap LLM to produce a compressed memory string.

        Each kept message is rendered as ``ROLE: content`` with the content
        cut to 200 characters. Messages with a missing or ``None`` role or
        content are tolerated ("UNKNOWN" role, empty content) instead of
        raising.
        """
        recent = history[-3:]
        omitted = len(history) - len(recent)
        lines = [f"[{omitted} earlier turns omitted]"] if omitted > 0 else []
        for message in recent:
            role = str(message.get("role") or "unknown").upper()
            content = str(message.get("content") or "")[:200]
            lines.append(f"{role}: {content}")
        context = "\n".join(lines)
        return f"\n\nConversation context:\n{context}"

    # ------------------------------------------------------------------
    # Token counting
    # ------------------------------------------------------------------
    @staticmethod
    def _count_tokens(text: str) -> int:
        """
        Approximate token count. Uses tiktoken if available and network
        accessible, otherwise falls back to word-based estimate (÷ 0.75).

        Declared static because it reads no instance state; it remains
        callable as ``self._count_tokens(text)``.
        """
        try:
            import tiktoken  # type: ignore

            enc = tiktoken.get_encoding("cl100k_base")
            return len(enc.encode(text))
        except Exception:
            # Best-effort by design: any failure (package missing, encoding
            # unavailable, encode error) degrades to a rough estimate.
            # Fallback: GPT tokenizers average ~0.75 words per token
            return int(len(text.split()) / 0.75)