Spaces:

Shrot102
/

llmopt-server

Sleeping

App Files Files Community

llmopt-server / llmopt /optimizer /prompt_optimizer.py

Shrot101

feat: upgrade LLMOpt to V2 ML-powered architecture

eff2120 21 days ago

raw

history blame contribute delete

8.65 kB

	"""
	Prompt Optimizer — reduces token usage in prompts while preserving
	semantic content.

	V1 strategy:
	- Whitespace normalization
	- Redundant preamble removal
	- Instruction conciseness rewriting
	- Conversation history summarization (stub)
	- System prompt selection

	V2: LLMLingua semantic compression (if llmlingua installed)
	Falls back to V1 heuristics if not available.
	"""

	from __future__ import annotations

	import logging
	import re
	from dataclasses import dataclass
	from typing import Optional

	logger = logging.getLogger(__name__)


	@dataclass
	class OptimizedPrompt:
	    """Result of a single optimization pass over a user query.

	    Instances are built by ``PromptOptimizer.optimize`` and carry both the
	    original and the rewritten query together with token accounting.
	    """

	    original_query: str  # query exactly as it was passed in
	    optimized_query: str  # query after compression (equal to the original if disabled)
	    system_prompt: str  # selected system prompt, plus any appended history context
	    original_tokens: int  # token count of ``original_query``
	    optimized_tokens: int  # token count of ``optimized_query``
	    tokens_saved: int  # ``original_tokens - optimized_tokens``, never negative
	    compression_ratio: float  # ``tokens_saved / original_tokens`` (0.0 when nothing saved)
	    techniques_applied: list[str]  # names of the techniques that changed the prompt

	    def to_dict(self) -> dict:
	        """Return the fields as a plain dict that is safe to mutate.

	        The ``techniques_applied`` list is copied as well, so appending to
	        the returned list does not alter this instance.
	        """
	        snapshot = self.__dict__.copy()
	        snapshot["techniques_applied"] = list(self.techniques_applied)
	        return snapshot


	# ---------------------------------------------------------------------------
	# System prompt templates
	# ---------------------------------------------------------------------------

	# Maps a style name to the system prompt text sent with the query.
	# Insertion order is verbose -> concise -> minimal (longest to shortest).
	_SYSTEM_PROMPTS = dict(
	    verbose=(
	        "You are a helpful, accurate, and thorough assistant. Provide complete, "
	        "well-structured answers. Think step by step when solving complex problems."
	    ),
	    concise=(
	        "You are a helpful assistant. "
	        "Be clear and accurate. "
	        "Avoid unnecessary verbosity."
	    ),
	    minimal="Answer concisely and accurately.",
	)

	# Patterns that add no semantic value: greetings and polite lead-ins anchored
	# at the start of the prompt, and courtesy sign-offs anchored at the end.
	# Each entry is a regex source string (case-insensitive via the inline flag)
	# whose match is deleted from the prompt.
	#
	# Alternation must be a bare ``|``; an escaped ``\|`` matches a literal pipe
	# character and would make the pattern dead for real text.
	_FILLER_PATTERNS = [
	    r"(?i)^(please\s+)?can\s+you\s+(please\s+)?",
	    r"(?i)^i\s+(would\s+like|want|need)\s+(you\s+to\s+)?",
	    r"(?i)^could\s+you\s+(please\s+)?",
	    # \b stops "ai" from eating the start of a longer word ("Hey Aidan ...").
	    r"(?i)^hey\s+(there\s+)?(claude|assistant|ai|chatgpt)\b[\s,!]*",
	    # At least one separator is required so words like "history" are untouched.
	    r"(?i)^(hi|hello|hey)[,!.\s]+",
	    r"(?i)\s+(please|thank\s+you|thanks)[.!]?\s*$",
	    r"(?i)^(i\s+)?(was\s+)?wondering\s+(if\s+)?(you\s+)?(could|can)\s+",
	]

	# Wordy instruction phrasing mapped to a shorter equivalent. Exposed as a
	# list of (regex source, replacement) pairs; pairs are applied in the order
	# listed, and an empty replacement simply deletes the matched phrase.
	_INSTRUCTION_REWRITES = list(
	    {
	        r"(?i)provide\s+a\s+detailed\s+explanation\s+of": "explain",
	        r"(?i)give\s+me\s+a\s+comprehensive\s+overview\s+of": "overview:",
	        r"(?i)i\s+need\s+you\s+to\s+write\s+a\s+": "write a ",
	        r"(?i)could\s+you\s+please\s+explain": "explain",
	        r"(?i)what\s+is\s+the\s+best\s+way\s+to": "best way to",
	        r"(?i)in\s+simple\s+terms,?\s+": "",
	        r"(?i)for\s+a\s+(complete\s+)?beginner[\s,]+": "",
	    }.items()
	)


	class PromptOptimizer:
	    """
	    Optimizes prompts to minimize token usage.

	    V2: Uses LLMLingua for semantic compression when installed.
	    Falls back to V1 heuristic compression (filler removal, rewrites) if not.
	    """

	    def __init__(self):
	        """Try to load the LLMLingua compressor.

	        ``self._llmlingua`` stays ``None`` when the package is missing or the
	        model fails to load; in that case only V1 heuristics are used.
	        Construction never raises because of LLMLingua problems.
	        """
	        self._llmlingua = None
	        try:
	            from llmlingua import PromptCompressor  # type: ignore
	            logger.info("PromptOptimizer: Loading LLMLingua compressor...")
	            self._llmlingua = PromptCompressor(
	                model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
	                use_llmlingua2=True,
	                device_map="cpu",
	            )
	            logger.info("PromptOptimizer: LLMLingua ready!")
	        except ImportError:
	            logger.info("PromptOptimizer: llmlingua not installed. Using V1 heuristic compression.")
	        except Exception as e:
	            # Deliberate best-effort: any load failure degrades to V1 heuristics.
	            logger.warning("PromptOptimizer: Failed to load LLMLingua: %s. Using V1 heuristics.", e)

	    def optimize(
	        self,
	        query: str,
	        system_prompt_style: str = "concise",
	        compression_enabled: bool = True,
	        conversation_history: Optional[list[dict]] = None,
	    ) -> OptimizedPrompt:
	        """
	        Returns an OptimizedPrompt with the reduced query and chosen system prompt.

	        Args:
	            query: The user prompt to optimize.
	            system_prompt_style: Key into ``_SYSTEM_PROMPTS``; unknown keys
	                fall back to the "concise" prompt.
	            compression_enabled: When False the query is passed through
	                unchanged and only the system prompt is selected.
	            conversation_history: Optional list of message dicts; when
	                non-empty, a truncated transcript is appended to the system
	                prompt.

	        Returns:
	            An ``OptimizedPrompt`` whose token figures cover the query only
	            (the system prompt and history are not counted).
	        """
	        techniques: list[str] = []
	        optimized = query

	        if compression_enabled:
	            optimized, applied = self._compress(optimized)
	            techniques.extend(applied)

	        system_prompt = _SYSTEM_PROMPTS.get(system_prompt_style, _SYSTEM_PROMPTS["concise"])

	        # Summarize history if provided (stub — V2 will use LLM summarization)
	        if conversation_history:
	            system_prompt += self._summarize_history(conversation_history)
	            techniques.append("history_summarization_stub")

	        orig_tokens = self._count_tokens(query)
	        opt_tokens = self._count_tokens(optimized)
	        # Clamp at zero: a rewrite may occasionally tokenize longer than the input.
	        saved = max(0, orig_tokens - opt_tokens)

	        return OptimizedPrompt(
	            original_query=query,
	            optimized_query=optimized,
	            system_prompt=system_prompt,
	            original_tokens=orig_tokens,
	            optimized_tokens=opt_tokens,
	            tokens_saved=saved,
	            # max(..., 1) avoids division by zero for an empty query.
	            compression_ratio=round(saved / max(orig_tokens, 1), 3),
	            techniques_applied=techniques,
	        )

	    # ------------------------------------------------------------------
	    # Compression pipeline
	    # ------------------------------------------------------------------

	    def _compress(self, text: str) -> tuple[str, list[str]]:
	        """Compress ``text`` and report which techniques changed it.

	        LLMLingua is tried first for prompts longer than 15 words when the
	        compressor is loaded; its output is used only if it is non-empty and
	        has fewer words than the input. Otherwise the V1 heuristics run:
	        whitespace normalization, filler removal, instruction rewrites.

	        Returns:
	            ``(compressed_text, techniques)`` where ``techniques`` lists the
	            names of the steps that actually modified the text.
	        """
	        techniques: list[str] = []

	        # --- V2: LLMLingua Semantic Compression ---
	        word_count = len(text.split())
	        if self._llmlingua and word_count > 15:
	            try:
	                result = self._llmlingua.compress_prompt(
	                    [text],
	                    rate=0.6,  # Keep 60% of tokens
	                    force_tokens=["?"],  # Always keep question marks
	                )
	                compressed = result["compressed_prompt"].strip()
	                # Only use if it actually saved tokens and isn't empty
	                if compressed and len(compressed.split()) < word_count:
	                    techniques.append("llmlingua_semantic_compression")
	                    return compressed, techniques
	            except Exception as e:
	                # Best-effort: any compressor failure falls through to V1.
	                logger.warning("LLMLingua compression failed: %s. Falling back to V1.", e)

	        # --- V1: Heuristic Compression ---

	        # 1. Whitespace normalization
	        cleaned = re.sub(r"\s+", " ", text).strip()
	        if cleaned != text:
	            techniques.append("whitespace_normalization")
	        normalized = cleaned

	        # 2. Remove filler preambles. Repeat until nothing changes, because
	        #    stripping one lead-in ("Hi, ") can expose another ("can you ...")
	        #    whose pattern was already tried earlier in the list. Every
	        #    successful substitution shortens the text, so this terminates.
	        filler_removed = False
	        changed = True
	        while changed:
	            changed = False
	            for pattern in _FILLER_PATTERNS:
	                stripped = re.sub(pattern, "", cleaned).strip()
	                if stripped != cleaned:
	                    cleaned = stripped
	                    changed = True
	                    filler_removed = True

	        if filler_removed and not cleaned:
	            # The prompt was nothing but filler (e.g. "Hello!"); keep it
	            # rather than handing an empty query downstream.
	            cleaned = normalized
	            filler_removed = False

	        if filler_removed:
	            techniques.append("filler_removal")
	            # Capitalize first letter after removal. Done only when something
	            # was removed, so untouched prompts (e.g. code or identifiers)
	            # are not altered.
	            if cleaned[0].islower():
	                cleaned = cleaned[0].upper() + cleaned[1:]

	        # 3. Instruction conciseness rewrites
	        for pattern, replacement in _INSTRUCTION_REWRITES:
	            rewritten = re.sub(pattern, replacement, cleaned)
	            if rewritten != cleaned:
	                cleaned = rewritten
	                if "instruction_rewrite" not in techniques:
	                    techniques.append("instruction_rewrite")

	        # 4. Deduplicate consecutive whitespace again after rewrites
	        cleaned = re.sub(r"\s+", " ", cleaned).strip()

	        return cleaned, techniques

	    # ------------------------------------------------------------------
	    # History summarization (stub)
	    # ------------------------------------------------------------------

	    def _summarize_history(self, history: list[dict]) -> str:
	        """
	        V1: truncate to last 3 turns.
	        V2: call a cheap LLM to produce a compressed memory string.

	        Each kept message is rendered as ``ROLE: content`` with the content
	        cut to 200 characters. A missing role is shown as ``UNKNOWN``;
	        missing or ``None`` content is rendered as empty text, and non-string
	        content is converted with ``str()`` instead of raising.

	        Returns:
	            A block starting with two newlines and the heading
	            "Conversation context:", suitable for appending to a system prompt.
	        """
	        recent = history[-3:]
	        omitted = len(history) - len(recent)

	        lines: list[str] = []
	        if omitted > 0:
	            lines.append(f"[{omitted} earlier turns omitted]")

	        for message in recent:
	            role = str(message.get("role") or "unknown").upper()
	            content = message.get("content")
	            if content is None:
	                content = ""
	            elif not isinstance(content, str):
	                content = str(content)
	            lines.append(f"{role}: {content[:200]}")

	        context = "\n".join(lines)
	        return f"\n\nConversation context:\n{context}"

	    # ------------------------------------------------------------------
	    # Token counting
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _count_tokens(text: str) -> int:
	        """
	        Approximate token count. Uses tiktoken's ``cl100k_base`` encoding when
	        it can be imported and loaded, otherwise falls back to a word-based
	        estimate (word count ÷ 0.75).
	        """
	        try:
	            import tiktoken  # type: ignore
	            enc = tiktoken.get_encoding("cl100k_base")
	            # disallowed_special=() makes text such as "<|endoftext|>" encode
	            # as ordinary characters; by default encode() raises on it, which
	            # would silently push user-supplied text onto the rough fallback.
	            return len(enc.encode(text, disallowed_special=()))
	        except Exception:
	            # Fallback: GPT tokenizers average ~0.75 words per token
	            return int(len(text.split()) / 0.75)