"""Prompt engineering toolkit: templates, chains, registry, and evaluation.""" from __future__ import annotations import json import re import time import unicodedata from dataclasses import dataclass, field from typing import Any from core.logger import get_logger logger = get_logger(__name__) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def _normalize(s: str) -> str: """Lowercase, strip, and remove accents for comparison.""" s = s.strip().lower() nfkd = unicodedata.normalize("NFKD", s) return "".join(c for c in nfkd if not unicodedata.combining(c)) def _extract_json(text: str) -> dict | None: """Try to parse JSON from LLM output, handling markdown wrappers.""" text = text.strip() # 1. Direct parse try: obj = json.loads(text) if isinstance(obj, dict): return obj except json.JSONDecodeError: pass # 2. Markdown code block match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL) if match: try: obj = json.loads(match.group(1).strip()) if isinstance(obj, dict): return obj except json.JSONDecodeError: pass # 3. Find first {...} (non-greedy, flat) match = re.search(r"\{[^{}]*\}", text) if match: try: obj = json.loads(match.group(0)) if isinstance(obj, dict): return obj except json.JSONDecodeError: pass return None # --------------------------------------------------------------------------- # PromptTemplate # --------------------------------------------------------------------------- class PromptTemplate: """A reusable prompt with variable substitution and metadata.""" def __init__( self, name: str, template: str, metadata: dict | None = None, ) -> None: self.name = name self.template = template self.metadata = metadata or {} def render(self, **kwargs: Any) -> str: """Substitute variables in the template.""" return self.template.format(**kwargs) def render_with_examples( self, examples: list[dict], **kwargs: Any ) -> str: """Format few-shot examples and inject them into the template. Each example dict should have ``input`` and ``output`` keys. The formatted block is passed as the ``{examples}`` variable. """ lines: list[str] = [] for ex in examples: lines.append(f"Input: {ex['input']}\nOutput: {ex['output']}") kwargs["examples"] = "\n\n".join(lines) return self.template.format(**kwargs) def __repr__(self) -> str: version = self.metadata.get("version", "?") return f"PromptTemplate(name={self.name!r}, version={version!r})" # --------------------------------------------------------------------------- # PromptRegistry # --------------------------------------------------------------------------- class PromptRegistry: """Centralized dictionary of PromptTemplates.""" def __init__(self) -> None: self._templates: dict[str, PromptTemplate] = {} def register(self, template: PromptTemplate) -> None: self._templates[template.name] = template def get(self, name: str) -> PromptTemplate: if name not in self._templates: raise KeyError(f"Template '{name}' not found in registry.") return self._templates[name] def list_all(self) -> list[str]: return list(self._templates.keys()) def get_version(self, name: str) -> str: return self.get(name).metadata.get("version", "unknown") # --------------------------------------------------------------------------- # PromptChain # --------------------------------------------------------------------------- @dataclass class ChainResult: """Result of running a PromptChain.""" steps: list[dict] final_response: str total_tokens: int total_latency_s: float class PromptChain: """Execute a sequence of PromptTemplates, piping outputs forward.""" def __init__(self, templates: list[PromptTemplate]) -> None: self.templates = templates def run(self, llm_client: Any, initial_input: dict) -> ChainResult: steps: list[dict] = [] current_vars = dict(initial_input) total_tokens = 0 total_latency_s = 0.0 for i, template in enumerate(self.templates): prompt = template.render(**current_vars) result = llm_client.chat(prompt) response_text = result["response"] metadata = result["metadata"] tokens = metadata["usage"]["total_tokens"] latency_s = metadata["latency_ms"] / 1000 steps.append({ "step": i + 1, "template": template.name, "prompt": prompt, "response": response_text, "tokens": tokens, "latency_s": latency_s, }) total_tokens += tokens total_latency_s += latency_s # Make output available for subsequent steps current_vars[f"step_{i + 1}_output"] = response_text current_vars["extraction_result"] = response_text return ChainResult( steps=steps, final_response=steps[-1]["response"] if steps else "", total_tokens=total_tokens, total_latency_s=total_latency_s, ) # --------------------------------------------------------------------------- # evaluate_prompt # --------------------------------------------------------------------------- @dataclass class EvalMetrics: """Metrics returned by evaluate_prompt.""" accuracy: float json_parse_rate: float campos_correctos_rate: float tokens_promedio: float latencia_promedio: float details: list[dict] = field(default_factory=list) def evaluate_prompt( prompt_or_chain: PromptTemplate | PromptChain, llm_client: Any, golden_set: list[dict], *, input_key: str = "ticket", delay: float = 1.0, verbose: bool = False, ) -> EvalMetrics: """Run *prompt_or_chain* against every example in *golden_set* and score. Returns accuracy, JSON parse rate, per-field correctness, average tokens, and average latency. """ total = len(golden_set) json_ok = 0 correct = 0 campos_correctos = 0 campos_total = 0 total_tokens = 0 total_latency = 0.0 details: list[dict] = [] for idx, item in enumerate(golden_set): input_text = item["input"] expected = item["expected"] # -- call the LLM ------------------------------------------------ try: if isinstance(prompt_or_chain, PromptChain): chain_result = prompt_or_chain.run( llm_client, {input_key: input_text} ) response_text = chain_result.final_response tokens = chain_result.total_tokens latency_s = chain_result.total_latency_s else: rendered = prompt_or_chain.render(**{input_key: input_text}) result = llm_client.chat(rendered) response_text = result["response"] tokens = result["metadata"]["usage"]["total_tokens"] latency_s = result["metadata"]["latency_ms"] / 1000 except Exception as exc: logger.warning("LLM error on item %d: %s", idx, exc) details.append({ "input": input_text, "expected": expected, "response": str(exc), "parsed": None, "correct": False, "json_valid": False, }) campos_total += 2 if delay > 0 and idx < total - 1: time.sleep(delay) continue total_tokens += tokens total_latency += latency_s # -- parse & compare --------------------------------------------- parsed = _extract_json(response_text) json_valid = parsed is not None if json_valid: json_ok += 1 cat_match = False pri_match = False if parsed: cat_match = ( _normalize(str(parsed.get("categoria", ""))) == _normalize(str(expected.get("categoria", ""))) ) pri_match = ( _normalize(str(parsed.get("prioridad", ""))) == _normalize(str(expected.get("prioridad", ""))) ) if cat_match: campos_correctos += 1 if pri_match: campos_correctos += 1 campos_total += 2 if cat_match and pri_match: correct += 1 detail = { "input": input_text, "expected": expected, "response": response_text, "parsed": parsed, "correct": cat_match and pri_match, "json_valid": json_valid, "cat_match": cat_match, "pri_match": pri_match, "tokens": tokens, "latency_s": latency_s, } details.append(detail) if verbose: status = "OK" if detail["correct"] else "FAIL" print(f" [{idx + 1}/{total}] {status} | {input_text[:60]}") if parsed: print(f" Esperado : {expected}") print(f" Obtenido : {parsed}") else: print(f" Respuesta (no JSON): {response_text[:120]}") if delay > 0 and idx < total - 1: time.sleep(delay) return EvalMetrics( accuracy=correct / total if total else 0, json_parse_rate=json_ok / total if total else 0, campos_correctos_rate=campos_correctos / campos_total if campos_total else 0, tokens_promedio=total_tokens / total if total else 0, latencia_promedio=total_latency / total if total else 0, details=details, )