# docops-agent / dspy_lab / optimize.py
# RamsesCamas's picture
# Initial deploy of DocOps Agent
# 87bb7af
"""Pipeline de compilación COMPLETA con MIPROv2.
Correr OFFLINE antes de clase. Toma ~10-15 minutos y puede quemar varios
miles de tokens de Groq. Cachea el resultado en JSON — si ya existe, aborta
(pasar --force para recompilar).
Output:
- dspy_lab/qa_optimized_mipro.json (programa compilado)
- evals/reports/mipro_report.md (comparativa baseline vs optimized)
"""
from __future__ import annotations

import argparse
import logging
import os
import time
from collections.abc import Callable
from pathlib import Path

import dspy
import litellm
from dspy.teleprompt import MIPROv2
from rich.console import Console

from data.loader import load_eval_set
from dspy_lab.metric import docops_metric
from dspy_lab.signatures import AnswerWithContext
# Where the compiled (optimized) program is saved; its presence is what
# main() treats as a cache hit.
OUT_PROGRAM = Path("dspy_lab") / "qa_optimized_mipro.json"
# Markdown report comparing the baseline and optimized scores.
OUT_REPORT = Path("evals", "reports", "mipro_report.md")
def _avg_score(program: dspy.Module, examples: list[dspy.Example]) -> float:
scores: list[float] = []
for ex in examples:
try:
pred = program(context=ex.context, question=ex.question)
scores.append(docops_metric(ex, pred))
except Exception:
scores.append(0.0)
return sum(scores) / len(scores) if scores else 0.0
def main() -> None:
    """Run the full MIPROv2 compile pipeline and persist its artifacts.

    Steps, in order:

    1. Parse ``--force``; return early when ``OUT_PROGRAM`` already exists and
       ``--force`` was not given.
    2. Configure the Groq-backed LM from ``GROQ_API_KEY``.
    3. Score the baseline ``ChainOfThought`` program on the validation split.
    4. Compile it with ``MIPROv2(auto="light")`` and score the result on the
       same split.
    5. Save the compiled program to ``OUT_PROGRAM`` and a Markdown comparison
       to ``OUT_REPORT``.

    Raises:
        SystemExit: If ``GROQ_API_KEY`` is not set (or is empty).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--force", action="store_true", help="Recompilar aunque el cache exista."
    )
    args = parser.parse_args()

    console = Console()
    console.print("[bold cyan]dspy_lab/optimize.py — MIPROv2 full compile[/bold cyan]")

    # The saved program doubles as the cache: skip the expensive compile
    # unless the caller explicitly asks for a rebuild.
    if OUT_PROGRAM.exists() and not args.force:
        console.print(f"[yellow]cache hit[/yellow] → {OUT_PROGRAM}")
        console.print("Usa --force para recompilar.")
        return

    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise SystemExit(
            "[error] GROQ_API_KEY no está definido. Expórtalo o añádelo a .env."
        )

    # Single source of truth for the model id used by the LM and the report.
    model_name = "groq/openai/gpt-oss-120b"
    retry_budget = 5

    # Groq free tier: 8k TPM for openai/gpt-oss-120b.
    # Configure aggressive retries so rate-limited calls wait and retry
    # automatically instead of failing.
    litellm.num_retries = retry_budget
    litellm.request_timeout = 120
    lm = dspy.LM(
        model_name,
        api_key=api_key,
        temperature=0.0,
        max_tokens=2048,
        # dspy.LM takes its own per-call retry count, which in recent DSPy
        # releases takes precedence over the litellm module-level default set
        # above, so the budget is passed explicitly as well.
        num_retries=retry_budget,
    )
    dspy.configure(lm=lm)

    trainset, valset, testset = load_eval_set(splits=(0.6, 0.2, 0.2))
    console.print(
        f"train={len(trainset)} val={len(valset)} test={len(testset)}"
    )

    # NOTE(review): both scores below are measured on valset, which is also
    # handed to the optimizer; testset is only counted in the report. Confirm
    # whether a held-out evaluation on testset is intended.
    qa_baseline = dspy.ChainOfThought(AnswerWithContext)
    console.print("Evaluando baseline...")
    t0 = time.perf_counter()
    baseline_score = _avg_score(qa_baseline, valset)
    baseline_time = time.perf_counter() - t0
    console.print(f" baseline: {baseline_score:.3f} ({baseline_time:.1f}s)")

    console.print("Compilando con MIPROv2 (auto=light)...")
    optimizer = MIPROv2(metric=docops_metric, auto="light")
    t0 = time.perf_counter()
    qa_optimized = optimizer.compile(
        qa_baseline, trainset=trainset, valset=valset, requires_permission_to_run=False
    )
    compile_time = time.perf_counter() - t0
    console.print(f" compile: {compile_time:.1f}s")

    t0 = time.perf_counter()
    optimized_score = _avg_score(qa_optimized, valset)
    eval_time = time.perf_counter() - t0
    console.print(f" optimized: {optimized_score:.3f} ({eval_time:.1f}s)")

    # Persist the compiled program first so it survives even if writing the
    # report fails afterwards.
    OUT_PROGRAM.parent.mkdir(parents=True, exist_ok=True)
    qa_optimized.save(str(OUT_PROGRAM))
    console.print(f"Programa guardado en {OUT_PROGRAM}")

    OUT_REPORT.parent.mkdir(parents=True, exist_ok=True)
    delta = optimized_score - baseline_score
    OUT_REPORT.write_text(
        f"""# MIPROv2 compile report
## Config
- LM: `{model_name}`
- Metric: `docops_metric` (0.6 * faithfulness + 0.4 * answer_relevancy)
- Optimizer: `MIPROv2(auto="light")`
- Splits: train={len(trainset)} / val={len(valset)} / test={len(testset)}
## Results
| Program | docops_metric (avg val) |
|---------|------------------------:|
| Baseline (ChainOfThought) | {baseline_score:.3f} |
| Optimized (MIPROv2) | {optimized_score:.3f} |
| **Delta** | **{delta:+.3f}** |
## Times
- Baseline eval: {baseline_time:.1f}s
- Compile: {compile_time:.1f}s
- Optimized eval: {eval_time:.1f}s
""",
        encoding="utf-8",
    )
    console.print(f"Reporte guardado en {OUT_REPORT}")
# Script entry point: run the compile pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()