Spaces:

xleaps
/

sgo

Running

Claude committed 11 days ago

Commit

a8d5d4c

unverified ·

1 Parent(s): 0a66d61

Add optional metric calibration to web UI

After evaluation, users can optionally anchor SGO scores to a real-world
metric (CTR, conversion rate, revenue, etc.) by entering the current
known value. The gradient table then shows predicted metric deltas
alongside score deltas.

- Single anchor: linear scaling (metric = k * score)
- Multiple anchors: Platt scaling via Newton's method
- Collapsible UI panel in step 1 after evaluation results
- Calibration data included in downloaded reports
- Generic naming ("metric") not CTR-specific

https://claude.ai/code/session_0141cbZmdz7ziFkNsQbq7z5Y

Files changed (2) hide show

web/app.py +137 -0
web/static/index.html +168 -6

web/app.py CHANGED Viewed

@@ -39,6 +39,7 @@ import sys
 sys.path.insert(0, str(PROJECT_ROOT / "scripts"))
 from evaluate import evaluate_one, analyze as analyze_eval, SYSTEM_PROMPT, BIAS_CALIBRATION_ADDENDUM
 from counterfactual import probe_one, analyze_gradient, build_changes_block, compute_goal_weights
 from generate_cohort import generate_segment
 from bias_audit import (
     reframe_entity, add_authority_signals, reorder_entity,
@@ -218,6 +219,15 @@ class CounterfactualConfig(BaseModel):
     parallel: int = 5
 class SuggestSegmentsInput(BaseModel):
     entity_text: str
     audience_context: str
@@ -298,7 +308,9 @@ async def create_session(entity: EntityInput):
         "cohort": None,
         "eval_results": None,
         "gradient": None,
         "bias_audit": None,
         "created": datetime.now().isoformat(),
     }
     return {"session_id": sid}
@@ -321,6 +333,101 @@ async def update_session_meta(sid: str, meta: SessionMetaUpdate):
     return {"ok": True}
 @app.get("/api/session/{sid}")
 async def get_session(sid: str):
     if sid not in sessions:
@@ -797,6 +904,10 @@ async def counterfactual_stream(sid: str, ticket: str, request: Request):
         gradient_text, ranked_data = analyze_gradient(results, all_changes,
                                                       goal_weights=goal_weights)
         session["gradient"] = gradient_text
         yield {"event": "complete", "data": json.dumps({
             "elapsed": round(elapsed, 1),
@@ -804,6 +915,8 @@ async def counterfactual_stream(sid: str, ticket: str, request: Request):
             "ranked": ranked_data,
             "results": results,
             "goal": goal if has_goal else None,
         })}
     return EventSourceResponse(event_generator(), ping=15)
@@ -995,6 +1108,30 @@ async def download_report(sid: str):
         lines.append(s["gradient"])
         lines.append("")
     # Bias audit
     if s.get("bias_audit"):
         audit = s["bias_audit"]

 sys.path.insert(0, str(PROJECT_ROOT / "scripts"))
 from evaluate import evaluate_one, analyze as analyze_eval, SYSTEM_PROMPT, BIAS_CALIBRATION_ADDENDUM
 from counterfactual import probe_one, analyze_gradient, build_changes_block, compute_goal_weights
+from ctr_calibrate import sigmoid, fit_platt_scaling, predict_ctr, ctr_derivative
 from generate_cohort import generate_segment
 from bias_audit import (
     reframe_entity, add_authority_signals, reorder_entity,
     parallel: int = 5
+# One calibration point: a mean SGO score paired with a metric value.
+class CalibrationAnchor(BaseModel):
+    mean_score: float  # score side of the anchor
+    metric_value: float  # metric side; set_calibration drops anchors where this is <= 0
+# Request body accepted by POST /api/calibrate/{sid}.
+class CalibrationInput(BaseModel):
+    metric_name: str = "conversion rate"  # label used in responses and the downloaded report
+    metric_unit: str = "%"  # suffix appended to metric values in the downloaded report
+    # At least 1; first is "current entity". The minimum is not enforced by the
+    # model itself: set_calibration rejects requests with no usable anchor.
+    anchors: list[CalibrationAnchor]
 class SuggestSegmentsInput(BaseModel):
     entity_text: str
     audience_context: str
         "cohort": None,
         "eval_results": None,
         "gradient": None,
+        "gradient_ranked": None,
         "bias_audit": None,
+        "calibration": None,
         "created": datetime.now().isoformat(),
     }
     return {"session_id": sid}
     return {"ok": True}
+@app.post("/api/calibrate/{sid}")
+async def set_calibration(sid: str, cal: CalibrationInput):
+    """Set metric calibration for a session. Requires eval results."""
+    if sid not in sessions:
+        raise HTTPException(404, "Session not found")
+    session = sessions[sid]
+    if not session["eval_results"]:
+        raise HTTPException(400, "Run evaluation first")
+    anchors = [{"mean_score": a.mean_score, "metric_value": a.metric_value}
+               for a in cal.anchors if a.metric_value > 0]
+    if not anchors:
+        raise HTTPException(400, "Need at least one anchor with metric_value > 0")
+    if len(anchors) == 1:
+        # Single anchor: linear scaling. metric = k * mean_score
+        k = anchors[0]["metric_value"] / anchors[0]["mean_score"]
+        session["calibration"] = {
+            "metric_name": cal.metric_name,
+            "metric_unit": cal.metric_unit,
+            "method": "linear",
+            "k": k,
+            "anchors": anchors,
+        }
+    else:
+        # 2+ anchors: Platt scaling
+        platt_anchors = [{"mean_score": a["mean_score"], "real_ctr": a["metric_value"]}
+                         for a in anchors]
+        a, b = fit_platt_scaling(platt_anchors)
+        session["calibration"] = {
+            "metric_name": cal.metric_name,
+            "metric_unit": cal.metric_unit,
+            "method": "platt",
+            "a": a, "b": b,
+            "anchors": anchors,
+        }
+    # Re-calibrate existing gradient if available
+    result = _apply_calibration(session)
+    return {"ok": True, "calibration": session["calibration"], "calibrated_gradient": result}
+@app.delete("/api/calibrate/{sid}")
+async def clear_calibration(sid: str):
+    """Remove metric calibration from a session."""
+    if sid not in sessions:
+        raise HTTPException(404, "Session not found")
+    sessions[sid]["calibration"] = None
+    return {"ok": True}
+def _apply_calibration(session):
+    """Apply calibration to existing gradient data. Returns calibrated ranked list or None."""
+    cal = session.get("calibration")
+    ranked = session.get("gradient_ranked")
+    if not cal or not ranked:
+        return None
+    valid = [r for r in (session.get("eval_results") or []) if r and "score" in r]
+    if not valid:
+        return None
+    mean_score = sum(r["score"] for r in valid) / len(valid)
+    if cal["method"] == "linear":
+        k = cal["k"]
+        current_metric = k * mean_score
+        result = []
+        for r in ranked:
+            metric_delta = r["avg_delta"] * k
+            result.append({
+                "id": r["id"],
+                "label": r["label"],
+                "avg_delta": r["avg_delta"],
+                "metric_delta": round(metric_delta, 4),
+                "predicted_metric": round(current_metric + metric_delta, 4),
+            })
+        return {"current_metric": round(current_metric, 4), "items": result}
+    elif cal["method"] == "platt":
+        a, b = cal["a"], cal["b"]
+        current_metric = predict_ctr(a, b, mean_score)
+        deriv = ctr_derivative(a, b, mean_score)
+        result = []
+        for r in ranked:
+            metric_delta = r["avg_delta"] * deriv
+            result.append({
+                "id": r["id"],
+                "label": r["label"],
+                "avg_delta": r["avg_delta"],
+                "metric_delta": round(metric_delta, 4),
+                "predicted_metric": round(current_metric + metric_delta, 4),
+            })
+        return {"current_metric": round(current_metric, 4), "items": result}
+    return None
 @app.get("/api/session/{sid}")
 async def get_session(sid: str):
     if sid not in sessions:
         gradient_text, ranked_data = analyze_gradient(results, all_changes,
                                                       goal_weights=goal_weights)
         session["gradient"] = gradient_text
+        session["gradient_ranked"] = ranked_data
+        # Apply metric calibration if set
+        calibrated = _apply_calibration(session)
         yield {"event": "complete", "data": json.dumps({
             "elapsed": round(elapsed, 1),
             "ranked": ranked_data,
             "results": results,
             "goal": goal if has_goal else None,
+            "calibrated": calibrated,
+            "calibration": session.get("calibration"),
         })}
     return EventSourceResponse(event_generator(), ping=15)
         lines.append(s["gradient"])
         lines.append("")
+    # Metric calibration
+    if s.get("calibration"):
+        cal = s["calibration"]
+        lines.append("---\n")
+        lines.append(f"## Metric Calibration ({cal['metric_name']})\n")
+        lines.append(f"- **Method:** {cal['method']}")
+        lines.append(f"- **Unit:** {cal['metric_unit']}")
+        for anc in cal.get("anchors", []):
+            lines.append(f"- Anchor: score {anc['mean_score']:.1f} = {anc['metric_value']}{cal['metric_unit']}")
+        calibrated = _apply_calibration(s)
+        if calibrated:
+            lines.append(f"\n**Current predicted {cal['metric_name']}:** "
+                         f"{calibrated['current_metric']}{cal['metric_unit']}\n")
+            lines.append(f"| Change | Score Delta | {cal['metric_name']} Delta | Predicted |")
+            lines.append("|--------|-----------|-------------|-----------|")
+            for item in calibrated["items"]:
+                lines.append(
+                    f"| {item['label']} | {item['avg_delta']:+.1f} | "
+                    f"{item['metric_delta']:+.4f}{cal['metric_unit']} | "
+                    f"{item['predicted_metric']}{cal['metric_unit']} |"
+                )
+            lines.append("")
     # Bias audit
     if s.get("bias_audit"):
         audit = s["bias_audit"]

web/static/index.html CHANGED Viewed

@@ -456,6 +456,44 @@
         <summary style="cursor:pointer;color:var(--text2);font-size:0.9rem">Full analysis</summary>
         <div class="results-details" id="evalAnalysis"></div>
       </details>
       <div class="btn-row mt-16">
         <button onclick="runDirections()">Test what to change next</button>
         <button class="secondary" onclick="goToStep(3)">Check panel realism</button>
@@ -1241,7 +1279,8 @@ async function runDirections() {
           return;
         }
-        renderGradientTable(d.results, suggestedChanges, d.ranked);
         document.getElementById('gradientText').textContent = d.gradient;
         document.getElementById('changesTested').textContent =
           suggestedChanges.map(c => `${c.label}: ${c.description}`).join('\n');
@@ -1257,7 +1296,7 @@ async function runDirections() {
   }
 }
-function renderGradientTable(results, changes, ranked) {
   // Use backend-provided ranked data (respects goal weights / VJP) when available,
   // falling back to client-side aggregation only for legacy responses.
   if (!ranked || !ranked.length) {
@@ -1299,6 +1338,37 @@ function renderGradientTable(results, changes, ranked) {
     ranked.forEach(r => { if (!r.desc) r.desc = descs[r.id] || ''; });
   }
   const tbody = document.querySelector('#gradientTable tbody');
   tbody.innerHTML = '';
   ranked.forEach((r, i) => {
@@ -1308,6 +1378,25 @@ function renderGradientTable(results, changes, ranked) {
     const barColor = avg >= 0 ? 'var(--green)' : 'var(--red)';
     const rowId = `gradient-detail-${i}`;
     // Summary row (clickable)
     tbody.innerHTML += `
       <tr onclick="document.getElementById('${rowId}').classList.toggle('hidden')" style="cursor:pointer">
@@ -1320,9 +1409,7 @@ function renderGradientTable(results, changes, ranked) {
           ${avg >= 0 ? '+' : ''}${avg.toFixed(1)}
           <span class="delta-bar" style="width:${barWidth}px;background:${barColor};margin-left:8px"></span>
         </td>
-        <td style="color:var(--text2)">${r.min_delta >= 0 ? '+' : ''}${r.min_delta} to +${r.max_delta}</td>
-        <td style="color:var(--green)">${r.positive}</td>
-        <td style="color:var(--red)">${r.negative}</td>
       </tr>
     `;
@@ -1352,7 +1439,7 @@ function renderGradientTable(results, changes, ranked) {
     tbody.innerHTML += `
       <tr id="${rowId}" class="hidden">
-        <td colspan="6" style="padding:0;background:var(--bg);border-bottom:2px solid var(--border)">${detailHtml}</td>
       </tr>
     `;
   });
@@ -1459,6 +1546,81 @@ function runBiasAudit() {
   };
 }
 // ── Download report ──
 function downloadReport() {

         <summary style="cursor:pointer;color:var(--text2);font-size:0.9rem">Full analysis</summary>
         <div class="results-details" id="evalAnalysis"></div>
       </details>
+      <details class="mt-16">
+        <summary style="cursor:pointer;color:var(--text2);font-size:0.9rem">Anchor to a real metric (optional)</summary>
+        <div style="padding:12px 0">
+          <p style="font-size:0.8rem;color:var(--text2);margin-bottom:12px">
+            If you know the actual performance of this entity (e.g. CTR, conversion rate, revenue),
+            SGO can translate score changes into predicted metric changes.
+          </p>
+          <div style="display:flex;gap:10px;flex-wrap:wrap;align-items:flex-end">
+            <div class="field" style="flex:2;min-width:140px;margin-bottom:0">
+              <label>Metric name</label>
+              <input type="text" id="calMetricName" placeholder="e.g. CTR, conversion rate" value="CTR">
+            </div>
+            <div class="field" style="flex:1;min-width:80px;margin-bottom:0">
+              <label>Current value</label>
+              <input type="number" id="calMetricValue" step="any" placeholder="e.g. 2.1">
+            </div>
+            <div class="field" style="flex:1;min-width:60px;margin-bottom:0">
+              <label>Unit</label>
+              <input type="text" id="calMetricUnit" value="%" style="width:60px">
+            </div>
+            <button class="secondary" onclick="applyCalibration()" style="margin-bottom:0;white-space:nowrap">Apply</button>
+            <button class="secondary" onclick="clearCalibration()" id="calClearBtn" style="margin-bottom:0;display:none;padding:10px 12px;color:var(--red);border-color:var(--red)">Clear</button>
+          </div>
+          <div id="calStatus" class="hidden mt-12" style="font-size:0.85rem"></div>
+          <details id="calMultiAnchor" class="mt-12">
+            <summary style="cursor:pointer;color:var(--text2);font-size:0.8rem">Add more anchors for better calibration</summary>
+            <div style="padding:8px 0">
+              <p style="font-size:0.75rem;color:var(--text2);margin-bottom:8px">
+                With 2+ anchors (from other SGO runs with known metrics), calibration uses
+                Platt scaling instead of linear scaling for better accuracy.
+              </p>
+              <div id="extraAnchors"></div>
+              <button class="secondary" onclick="addAnchorRow()" style="padding:4px 12px;font-size:0.75rem">+ Add anchor</button>
+            </div>
+          </details>
+        </div>
+      </details>
       <div class="btn-row mt-16">
         <button onclick="runDirections()">Test what to change next</button>
         <button class="secondary" onclick="goToStep(3)">Check panel realism</button>
           return;
         }
+        if (d.calibration) currentCalibration = d.calibration;
+        renderGradientTable(d.results, suggestedChanges, d.ranked, d.calibrated);
         document.getElementById('gradientText').textContent = d.gradient;
         document.getElementById('changesTested').textContent =
           suggestedChanges.map(c => `${c.label}: ${c.description}`).join('\n');
   }
 }
+function renderGradientTable(results, changes, ranked, calibrated) {
   // Use backend-provided ranked data (respects goal weights / VJP) when available,
   // falling back to client-side aggregation only for legacy responses.
   if (!ranked || !ranked.length) {
     ranked.forEach(r => { if (!r.desc) r.desc = descs[r.id] || ''; });
   }
+  // Build calibration lookup if available
+  const calLookup = {};
+  const hasCal = calibrated && calibrated.items && calibrated.items.length > 0;
+  if (hasCal) {
+    calibrated.items.forEach(item => { calLookup[item.id] = item; });
+  }
+  // Update table header
+  const thead = document.querySelector('#gradientTable thead tr');
+  thead.innerHTML = hasCal
+    ? '<th>#</th><th>Change</th><th>Score Impact</th><th>Metric Impact</th><th>Predicted</th><th>Range</th>'
+    : '<th>#</th><th>Change</th><th>Avg Impact</th><th>Range</th><th>Helps</th><th>Hurts</th>';
+  // Show calibration summary above table
+  let calSummaryEl = document.getElementById('calSummary');
+  if (!calSummaryEl) {
+    calSummaryEl = document.createElement('div');
+    calSummaryEl.id = 'calSummary';
+    calSummaryEl.style.cssText = 'font-size:0.85rem;margin-bottom:12px';
+    document.getElementById('gradientTable').parentElement.insertBefore(
+      calSummaryEl, document.getElementById('gradientTable'));
+  }
+  if (hasCal && currentCalibration) {
+    const mn = currentCalibration.metric_name || 'metric';
+    const mu = currentCalibration.metric_unit || '';
+    calSummaryEl.innerHTML = `<span style="color:var(--accent2)">Calibrated to ${esc(mn)}</span> — current: <strong>${calibrated.current_metric}${esc(mu)}</strong>`;
+    calSummaryEl.classList.remove('hidden');
+  } else {
+    calSummaryEl.classList.add('hidden');
+  }
   const tbody = document.querySelector('#gradientTable tbody');
   tbody.innerHTML = '';
   ranked.forEach((r, i) => {
     const barColor = avg >= 0 ? 'var(--green)' : 'var(--red)';
     const rowId = `gradient-detail-${i}`;
+    const calItem = calLookup[r.id];
+    let calCols = '';
+    if (hasCal && calItem) {
+      const mu = (currentCalibration && currentCalibration.metric_unit) || '';
+      const md = calItem.metric_delta;
+      const mdCls = md >= 0 ? 'delta-pos' : 'delta-neg';
+      calCols = `
+        <td class="${mdCls}">${md >= 0 ? '+' : ''}${formatMetric(md, mu)}</td>
+        <td style="font-weight:600">${formatMetric(calItem.predicted_metric, mu)}</td>
+      `;
+    } else if (hasCal) {
+      calCols = '<td>—</td><td>—</td>';
+    }
+    const rangeCols = hasCal ? '' : `
+        <td style="color:var(--text2)">${r.min_delta >= 0 ? '+' : ''}${r.min_delta} to +${r.max_delta}</td>
+        <td style="color:var(--green)">${r.positive}</td>
+        <td style="color:var(--red)">${r.negative}</td>`;
     // Summary row (clickable)
     tbody.innerHTML += `
       <tr onclick="document.getElementById('${rowId}').classList.toggle('hidden')" style="cursor:pointer">
           ${avg >= 0 ? '+' : ''}${avg.toFixed(1)}
           <span class="delta-bar" style="width:${barWidth}px;background:${barColor};margin-left:8px"></span>
         </td>
+        ${calCols}${rangeCols}
       </tr>
     `;
     tbody.innerHTML += `
       <tr id="${rowId}" class="hidden">
+        <td colspan="${hasCal ? 6 : 6}" style="padding:0;background:var(--bg);border-bottom:2px solid var(--border)">${detailHtml}</td>
       </tr>
     `;
   });
   };
 }
+// ── Metric Calibration ──
+// Calibration object last received from the server (assigned in runDirections
+// and applyCalibration), or null when none is active / after clearCalibration.
+let currentCalibration = null;
+function formatMetric(value, unit) {
+  if (unit === '%') return value.toFixed(2) + '%';
+  if (unit === '$') return '$' + value.toFixed(2);
+  return value.toFixed(4) + (unit ? ' ' + unit : '');
+}
+function addAnchorRow() {
+  const container = document.getElementById('extraAnchors');
+  const idx = container.children.length;
+  const row = document.createElement('div');
+  row.style.cssText = 'display:flex;gap:8px;align-items:center;margin-bottom:6px';
+  row.innerHTML = `
+    <input type="number" step="any" placeholder="Mean score" style="flex:1;padding:6px;font-size:0.8rem" class="anchor-score">
+    <span style="font-size:0.8rem;color:var(--text2)">=</span>
+    <input type="number" step="any" placeholder="Metric value" style="flex:1;padding:6px;font-size:0.8rem" class="anchor-value">
+    <button class="secondary" onclick="this.parentElement.remove()" style="padding:4px 8px;font-size:0.75rem">x</button>
+  `;
+  container.appendChild(row);
+}
+async function applyCalibration() {
+  if (!sessionId) return alert('Run evaluation first.');
+  const metricName = document.getElementById('calMetricName').value.trim() || 'metric';
+  const metricValue = parseFloat(document.getElementById('calMetricValue').value);
+  const metricUnit = document.getElementById('calMetricUnit').value.trim() || '';
+  if (!metricValue || metricValue <= 0) return alert('Enter a positive metric value.');
+  // Get the current mean score from eval results
+  const valid = (evalResultsData || []).filter(r => r && r.score);
+  if (!valid.length) return alert('No evaluation data.');
+  const meanScore = valid.reduce((s, r) => s + r.score, 0) / valid.length;
+  // Build anchors: current entity + any extra
+  const anchors = [{mean_score: meanScore, metric_value: metricValue}];
+  document.querySelectorAll('#extraAnchors > div').forEach(row => {
+    const score = parseFloat(row.querySelector('.anchor-score').value);
+    const value = parseFloat(row.querySelector('.anchor-value').value);
+    if (score > 0 && value > 0) anchors.push({mean_score: score, metric_value: value});
+  });
+  try {
+    const resp = await fetch(`/api/calibrate/${sessionId}`, {
+      method: 'POST',
+      headers: llmHeaders(),
+      body: JSON.stringify({metric_name: metricName, metric_unit: metricUnit, anchors}),
+    });
+    const data = await resp.json();
+    if (!resp.ok) throw new Error(data.detail || 'Calibration failed');
+    currentCalibration = data.calibration;
+    const status = document.getElementById('calStatus');
+    const method = anchors.length === 1 ? 'linear scaling' : 'Platt scaling';
+    status.innerHTML = `<span style="color:var(--green)">Calibrated (${esc(method)})</span> — gradient will show ${esc(metricName)} deltas`;
+    status.classList.remove('hidden');
+    document.getElementById('calClearBtn').style.display = '';
+  } catch (e) {
+    const status = document.getElementById('calStatus');
+    status.innerHTML = `<span style="color:var(--red)">Error: ${esc(e.message)}</span>`;
+    status.classList.remove('hidden');
+  }
+}
+async function clearCalibration() {
+  if (!sessionId) return;
+  await fetch(`/api/calibrate/${sessionId}`, {method: 'DELETE', headers: llmHeaders()});
+  currentCalibration = null;
+  document.getElementById('calStatus').classList.add('hidden');
+  document.getElementById('calClearBtn').style.display = 'none';
+}
 // ── Download report ──
 function downloadReport() {