llmopt-server / llmopt /optimizer /prompt_optimizer.py
Shrot101's picture
feat: upgrade LLMOpt to V2 ML-powered architecture
eff2120
"""
Prompt Optimizer — reduces token usage in prompts while preserving
semantic content.
V1 strategy:
- Whitespace normalization
- Redundant preamble removal
- Instruction conciseness rewriting
- Conversation history summarization (stub)
- System prompt selection
V2: LLMLingua semantic compression (if llmlingua installed)
Falls back to V1 heuristics if not available.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from typing import Optional
logger = logging.getLogger(__name__)
@dataclass
class OptimizedPrompt:
    """
    Result of one PromptOptimizer.optimize() call.

    Bundles the original and optimized query with the chosen system prompt
    and the token accounting for the optimization.
    """

    # Query exactly as passed to optimize().
    original_query: str
    # Query after compression (equal to original_query when compression is off).
    optimized_query: str
    # Selected system prompt template, plus conversation context if history was given.
    system_prompt: str
    # Token count of original_query.
    original_tokens: int
    # Token count of optimized_query.
    optimized_tokens: int
    # original_tokens - optimized_tokens, clamped at 0.
    tokens_saved: int
    # tokens_saved / original_tokens, rounded to 3 decimals.
    compression_ratio: float
    # Names of the techniques that changed the prompt, in application order.
    techniques_applied: list[str]

    def to_dict(self) -> dict:
        """
        Return the fields as a plain dict, keyed by field name.

        The techniques list is copied so that mutating the returned dict
        cannot alter this instance (a bare ``__dict__.copy()`` is shallow
        and would share the list).
        """
        data = dict(self.__dict__)
        data["techniques_applied"] = list(self.techniques_applied)
        return data
# ---------------------------------------------------------------------------
# System prompt templates
# ---------------------------------------------------------------------------
# Maps a style name to the system prompt text that ends up in
# OptimizedPrompt.system_prompt. PromptOptimizer.optimize() looks up its
# `system_prompt_style` argument here and uses the "concise" entry when the
# requested style is not a key of this dict.
_SYSTEM_PROMPTS = {
    # Longest template: asks for thorough, step-by-step answers.
    "verbose": (
        "You are a helpful, accurate, and thorough assistant. "
        "Provide complete, well-structured answers. "
        "Think step by step when solving complex problems."
    ),
    # Default style, and the fallback for unknown style names.
    "concise": (
        "You are a helpful assistant. Be clear and accurate. "
        "Avoid unnecessary verbosity."
    ),
    # Shortest template: a single instruction.
    "minimal": "Answer concisely and accurately.",
}
# Patterns that add no semantic value.
# Each regex is deleted from the query with re.sub (see
# PromptOptimizer._compress). All are case-insensitive via the inline (?i)
# flag; most are anchored to the start of the query, one to the end.
_FILLER_PATTERNS = [
    r"(?i)^(please\s+)?can\s+you\s+(please\s+)?",
    r"(?i)^i\s+(would\s+like|want|need)\s+(you\s+to\s+)?",
    r"(?i)^could\s+you\s+(please\s+)?",
    # \b after the name keeps "ai" from matching the start of a longer word
    # ("Hey Aiden, ..." must not be cut down to "den, ...").
    r"(?i)^hey\s+(there\s+)?(claude|assistant|ai|chatgpt)\b[\s,!]*",
    r"(?i)^(hi|hello|hey)[,!.\s]+",
    # Trailing courtesy at the end of the query.
    r"(?i)\s+(please|thank\s+you|thanks)[.!]?\s*$",
    r"(?i)^(i\s+)?(was\s+)?wondering\s+(if\s+)?(you\s+)?(could|can)\s+",
]
# Verbose instruction phrases → concise alternatives.
# Each entry is a (regex, replacement) pair. PromptOptimizer._compress applies
# them with re.sub in list order; the inline (?i) flag makes every pattern
# case-insensitive. None of the patterns is anchored, so a phrase is rewritten
# wherever it occurs in the query.
_INSTRUCTION_REWRITES = [
    (r"(?i)provide\s+a\s+detailed\s+explanation\s+of", "explain"),
    (r"(?i)give\s+me\s+a\s+comprehensive\s+overview\s+of", "overview:"),
    (r"(?i)i\s+need\s+you\s+to\s+write\s+a\s+", "write a "),
    (r"(?i)could\s+you\s+please\s+explain", "explain"),
    (r"(?i)what\s+is\s+the\s+best\s+way\s+to", "best way to"),
    # The two rules below delete the phrase outright (empty replacement).
    # NOTE(review): dropping "in simple terms" / "for a beginner" removes an
    # audience hint from the request — confirm this is intended.
    (r"(?i)in\s+simple\s+terms,?\s+", ""),
    (r"(?i)for\s+a\s+(complete\s+)?beginner[\s,]+", ""),
]
class PromptOptimizer:
    """
    Optimizes prompts to minimize token usage.

    V2: Uses LLMLingua for semantic compression when installed.
    Falls back to V1 heuristic compression (filler removal, rewrites) if not.
    """

    # LLMLingua is only attempted for queries longer than this many words.
    _LLMLINGUA_MIN_WORDS = 15
    # Fraction of tokens LLMLingua is asked to keep.
    _LLMLINGUA_RATE = 0.6
    # History summarization keeps this many trailing turns ...
    _HISTORY_MAX_TURNS = 3
    # ... and this many leading characters of each kept turn.
    _HISTORY_MAX_CHARS = 200

    def __init__(self):
        """
        Try to load the LLMLingua compressor.

        On any failure (package missing or model load error) the instance
        keeps ``_llmlingua = None`` and only the V1 heuristics are used.
        """
        self._llmlingua = None
        try:
            from llmlingua import PromptCompressor  # type: ignore

            logger.info("PromptOptimizer: Loading LLMLingua compressor...")
            self._llmlingua = PromptCompressor(
                model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
                use_llmlingua2=True,
                device_map="cpu",
            )
            logger.info("PromptOptimizer: LLMLingua ready!")
        except ImportError:
            logger.info("PromptOptimizer: llmlingua not installed. Using V1 heuristic compression.")
        except Exception as e:
            # Deliberate best-effort: a broken model must not prevent startup.
            logger.warning(
                "PromptOptimizer: Failed to load LLMLingua: %s. Using V1 heuristics.", e
            )

    def optimize(
        self,
        query: str,
        system_prompt_style: str = "concise",
        compression_enabled: bool = True,
        conversation_history: Optional[list[dict]] = None,
        # Quoted so the class can be defined even where annotations are
        # evaluated eagerly and OptimizedPrompt is not yet in scope.
    ) -> "OptimizedPrompt":
        """
        Returns an OptimizedPrompt with the reduced query and chosen system prompt.

        Args:
            query: The user query to optimize.
            system_prompt_style: Key into ``_SYSTEM_PROMPTS``; unknown styles
                fall back to "concise".
            compression_enabled: When False the query is passed through
                unchanged.
            conversation_history: Optional list of message dicts; when
                non-empty, a truncated transcript is appended to the system
                prompt.

        Returns:
            OptimizedPrompt with token counts for the original and optimized
            query. Token accounting covers the query only, not the system
            prompt.
        """
        techniques: list[str] = []
        optimized = query

        if compression_enabled:
            optimized, applied = self._compress(optimized)
            techniques.extend(applied)

        system_prompt = _SYSTEM_PROMPTS.get(system_prompt_style, _SYSTEM_PROMPTS["concise"])

        # Summarize history if provided (stub — V2 will use LLM summarization)
        if conversation_history:
            system_prompt += self._summarize_history(conversation_history)
            techniques.append("history_summarization_stub")

        orig_tokens = self._count_tokens(query)
        opt_tokens = self._count_tokens(optimized)
        # Clamp at zero: a rewrite may occasionally tokenize longer.
        saved = max(0, orig_tokens - opt_tokens)

        return OptimizedPrompt(
            original_query=query,
            optimized_query=optimized,
            system_prompt=system_prompt,
            original_tokens=orig_tokens,
            optimized_tokens=opt_tokens,
            tokens_saved=saved,
            # max(..., 1) avoids division by zero for an empty query.
            compression_ratio=round(saved / max(orig_tokens, 1), 3),
            techniques_applied=techniques,
        )

    # ------------------------------------------------------------------
    # Compression pipeline
    # ------------------------------------------------------------------
    def _compress(self, text: str) -> tuple[str, list[str]]:
        """
        Compress ``text`` and report which techniques changed it.

        Tries LLMLingua first (when loaded and the query is long enough) and
        returns its result if it is non-empty and shorter in words.
        Otherwise runs the V1 heuristics: whitespace normalization, filler
        removal, instruction rewrites, then first-letter capitalization.

        Returns:
            ``(compressed_text, technique_names)``.
        """
        techniques: list[str] = []
        word_count = len(text.split())

        # --- V2: LLMLingua Semantic Compression ---
        if self._llmlingua and word_count > self._LLMLINGUA_MIN_WORDS:
            try:
                result = self._llmlingua.compress_prompt(
                    [text],
                    rate=self._LLMLINGUA_RATE,
                    force_tokens=["?"],  # Always keep question marks
                )
                compressed = result["compressed_prompt"].strip()
                # Only use if it actually saved tokens and isn't empty
                if compressed and len(compressed.split()) < word_count:
                    techniques.append("llmlingua_semantic_compression")
                    return compressed, techniques
            except Exception as e:
                logger.warning("LLMLingua compression failed: %s. Falling back to V1.", e)

        # --- V1: Heuristic Compression ---
        # 1. Whitespace normalization
        cleaned = re.sub(r"\s+", " ", text).strip()
        if cleaned != text:
            techniques.append("whitespace_normalization")

        # 2. Remove filler preambles. Repeat until nothing changes so stacked
        #    preambles ("Hey Claude, can you please ...") are fully removed
        #    regardless of pattern order. Each accepted removal shortens the
        #    text, so the loop terminates.
        filler_removed = False
        changed = True
        while changed:
            changed = False
            for pattern in _FILLER_PATTERNS:
                candidate = re.sub(pattern, "", cleaned).strip()
                # Never strip the query down to nothing (e.g. a bare "Hello!").
                if candidate and candidate != cleaned:
                    cleaned = candidate
                    changed = True
                    filler_removed = True
        if filler_removed:
            techniques.append("filler_removal")

        # 3. Instruction conciseness rewrites
        rewritten = False
        for pattern, replacement in _INSTRUCTION_REWRITES:
            candidate = re.sub(pattern, replacement, cleaned)
            # Same guard as above: a rewrite must not erase the whole query.
            if candidate != cleaned and candidate.strip():
                cleaned = candidate
                rewritten = True
        if rewritten:
            techniques.append("instruction_rewrite")

        # 4. Deduplicate consecutive whitespace again after rewrites
        cleaned = re.sub(r"\s+", " ", cleaned).strip()

        # 5. Capitalize the first letter last, so that rewrites which replace
        #    the start of the query ("What is the best way to" -> "best way
        #    to") do not leave a lowercase sentence start.
        if cleaned and cleaned[0].islower():
            cleaned = cleaned[0].upper() + cleaned[1:]

        return cleaned, techniques

    # ------------------------------------------------------------------
    # History summarization (stub)
    # ------------------------------------------------------------------
    def _summarize_history(self, history: list[dict]) -> str:
        """
        V1: truncate to last 3 turns.
        V2: call a cheap LLM to produce a compressed memory string.

        Each kept turn is rendered as ``ROLE: content`` with the content cut
        to 200 characters. When earlier turns are dropped, a one-line marker
        with their count precedes the transcript.

        Returns:
            A block starting with two newlines and a "Conversation context:"
            header, ready to append to a system prompt.
        """
        recent = history[-self._HISTORY_MAX_TURNS:]
        omitted = len(history) - len(recent)

        lines = [self._format_turn(m, self._HISTORY_MAX_CHARS) for m in recent]
        if omitted > 0:
            lines.insert(0, f"[{omitted} earlier turns omitted]")

        context = "\n".join(lines)
        return f"\n\nConversation context:\n{context}"

    @staticmethod
    def _format_turn(message: dict, max_chars: int) -> str:
        """
        Render one history message as ``ROLE: content``.

        Tolerates a missing/empty role (rendered as UNKNOWN) and missing,
        ``None`` or non-string content instead of raising.
        """
        role = str(message.get("role") or "unknown").upper()
        content = message.get("content")
        if content is None:
            text = ""
        elif isinstance(content, str):
            text = content
        else:
            text = str(content)
        return f"{role}: {text[:max_chars]}"

    # ------------------------------------------------------------------
    # Token counting
    # ------------------------------------------------------------------
    @staticmethod
    def _count_tokens(text: str) -> int:
        """
        Approximate token count. Uses tiktoken if available and network
        accessible, otherwise falls back to word-based estimate (÷ 0.75).
        The fallback is accurate to within ~10% for English text.
        """
        try:
            import tiktoken  # type: ignore

            enc = tiktoken.get_encoding("cl100k_base")
            return len(enc.encode(text))
        except Exception:
            # Deliberately broad: covers both a missing package and a failure
            # to obtain the encoding.
            # Fallback: GPT tokenizers average ~0.75 words per token
            return int(len(text.split()) / 0.75)