claw-web-v2

Sleeping

File size: 53,873 Bytes

/**
 * Claw Agent Runtime — the core agentic conversation loop.
 * Handles streaming responses, tool calls, and multi-turn conversations.
 */

import { ENV } from "../_core/env";
import { buildSystemPrompt, TOOL_DEFINITIONS } from "./system-prompt";
import { executeTool, getPlanMode, runPreToolHooks, runPostToolHooks, initializeMcpFromConfig, getMcpManager } from "../tools/executor";
import { compactSession, compactSessionWithLLM, shouldCompact, estimateSessionTokens, dbMessagesToSession, DEFAULT_COMPACTION_CONFIG } from "./compact";
import type { Session, ConversationMessage as CompactMessage, CompactionConfig } from "./compact";
import { UsageTracker, pricingForModel, defaultSonnetTierPricing, estimateCostUsdWithPricing, totalCostUsd, formatUsd, summaryLinesForModel } from "./usage";
import type { TokenUsage } from "./usage";
import type { Response } from "express";
import { execSync } from "child_process";

// In original claw-code, max_iterations defaults to usize::MAX (effectively unlimited).
// This port caps the loop instead: runAgentLoop uses config.maxIterations, falling back to 200.
// Auto-compact is triggered on context overflow (400 error) — matches original compact() method.

/**
 * Context window sizes, in tokens, for known models.
 *
 * Keys are full model ids as they are sent to the API (i.e. after alias
 * resolution in resolveApiConfig). runAgentLoop looks the resolved model up
 * here and sets the compaction threshold to 70% of the value.
 * Models that are missing from this table use DEFAULT_CONTEXT_WINDOW.
 */
const MODEL_CONTEXT_WINDOWS: Record<string, number> = {
  // Xiaomi MiMo
  "XiaomiMiMo/MiMo-V2-Flash": 262144,
  // Qwen models (DeepInfra + HuggingFace)
  "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo": 262144,
  "Qwen/Qwen3-Coder-480B-A35B-Instruct": 262144,
  "Qwen/Qwen3-235B-A22B-Instruct-2507": 262144,
  "Qwen/Qwen3-235B-A22B-Thinking-2507": 262144,
  "Qwen/Qwen3.5-397B-A17B": 262144,
  "Qwen/Qwen3.5-122B-A10B": 262144,
  "Qwen/Qwen3-Coder-Next": 131072,
  "Qwen/Qwen3-32B": 40960,
  "Qwen/Qwen3-8B": 32768,
  "Qwen/Qwen3-Coder-30B-A3B-Instruct": 131072,
  // Meta Llama
  "meta-llama/Llama-3.3-70B-Instruct": 131072,
  "meta-llama/Llama-4-Maverick-17B-128E": 1048576,
  "meta-llama/Llama-4-Scout-17B-16E": 327680,
  // DeepSeek
  "deepseek-ai/DeepSeek-V3.2": 163840,
  "deepseek-ai/DeepSeek-V3.1": 163840,
  "deepseek-ai/DeepSeek-R1": 131072,
  "deepseek-ai/DeepSeek-R1-0528": 163840,
  // NVIDIA Nemotron
  "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B": 262144,
  // StepFun
  "stepfun-ai/Step-3.5-Flash": 262144,
  // NousResearch (uncensored)
  "NousResearch/Hermes-3-Llama-3.1-70B": 131072,
  "NousResearch/Hermes-3-Llama-3.1-405B": 131072,
  // Anthropic
  "claude-opus-4-6": 200000,
  "claude-sonnet-4-6": 200000,
  // OpenAI
  "gpt-5.4": 1048576,
  "gpt-4.1": 1048576,
  // xAI
  "grok-3": 131072,
  // Google
  "google/gemini-2.5-flash": 1000000,
  "google/gemini-2.5-pro": 1000000,
};

/** Context window (tokens) assumed for any model id not listed in MODEL_CONTEXT_WINDOWS. */
const DEFAULT_CONTEXT_WINDOW = 131072;

/**
 * Adapt the agent's message list to the Session shape used by compact.ts.
 *
 * Each message is first flattened into the row shape that dbMessagesToSession
 * accepts: a missing or empty content becomes "", and a missing tool name or
 * tool call id becomes null.
 *
 * @param messages Conversation messages in agent format.
 * @returns Whatever Session dbMessagesToSession builds from those rows.
 */
function agentMessagesToSession(messages: AgentMessage[]): Session {
  const rows = messages.map((message) => {
    const { role, content, name, tool_call_id: callId } = message;
    return {
      role,
      content: content || "",
      toolName: name || null,
      toolCallId: callId || null,
    };
  });
  return dbMessagesToSession(rows);
}

/**
 * Convert a compacted Session back to AgentMessage[] format.
 *
 * For every session message:
 * - all "text" blocks are joined with "\n" to form the content (null if empty);
 * - "tool_use" blocks become OpenAI-style tool_calls with freshly generated ids;
 * - if a "tool_result" block is present, its tool name and output replace the
 *   message's name and content.
 *
 * Generated ids include the message index as well as the block index, so two
 * assistant messages converted in the same call can never share an id.
 *
 * NOTE(review): tool_call_id is not restored on tool-result messages here, so
 * results are not linked back to the regenerated call ids — confirm whether the
 * Session blocks carry an id that should be propagated.
 *
 * @param session Compacted session produced by compact.ts.
 * @returns Messages in agent format, in the same order as session.messages.
 */
function sessionToAgentMessages(session: Session): AgentMessage[] {
  // One timestamp per conversion; uniqueness within the call comes from the indices.
  const stamp = Date.now();

  return session.messages.map((msg, msgIndex) => {
    const agentMsg: AgentMessage = {
      role: msg.role,
      content: msg.blocks
        .filter((b) => b.type === "text")
        .map((b) => b.text || "")
        .join("\n") || null,
    };

    // Reconstruct tool_calls from tool_use blocks.
    const toolUseBlocks = msg.blocks.filter((b) => b.type === "tool_use");
    if (toolUseBlocks.length > 0) {
      agentMsg.tool_calls = toolUseBlocks.map((b, callIndex) => ({
        id: `compacted_${msgIndex}_${callIndex}_${stamp}`,
        type: "function" as const,
        function: {
          name: b.name || "unknown",
          arguments: b.input || "{}",
        },
      }));
    }

    // Reconstruct tool result fields; the result output overrides any text content.
    const toolResultBlock = msg.blocks.find((b) => b.type === "tool_result");
    if (toolResultBlock) {
      agentMsg.name = toolResultBlock.toolName;
      agentMsg.content = toolResultBlock.output || "";
    }
    return agentMsg;
  });
}

/**
 * Rough token estimate for a conversation.
 *
 * Heuristic: about 4 characters per token, rounded up, plus a flat 4-token
 * overhead for every message and for every tool call (name + arguments).
 *
 * @param messages Conversation messages to measure.
 * @returns Estimated token count; 0 for an empty list.
 */
function estimateConversationTokens(messages: AgentMessage[]): number {
  const OVERHEAD_TOKENS = 4; // role/framing overhead per message or tool call
  const charsToTokens = (chars: number): number => Math.ceil(chars / 4) + OVERHEAD_TOKENS;

  return messages.reduce((sum, message) => {
    const contentTokens = charsToTokens(message.content?.length || 0);
    const toolCallTokens = (message.tool_calls ?? []).reduce(
      (acc, call) => acc + charsToTokens(call.function.name.length + call.function.arguments.length),
      0
    );
    return sum + contentTokens + toolCallTokens;
  }, 0);
}

/**
 * A single conversation message in the shape runAgentLoop sends to the
 * chat-completions endpoint (role/content plus optional tool-calling fields).
 */
interface AgentMessage {
  /** Author of the message. */
  role: "system" | "user" | "assistant" | "tool";
  /** Message text; null when an assistant turn produced no text (e.g. only tool calls). */
  content: string | null;
  /** Tool calls requested by the assistant; forwarded to the API unchanged when present. */
  tool_calls?: Array<{
    id: string;
    type: "function";
    /** `arguments` is the raw JSON string emitted by the model; it is parsed before tool execution. */
    function: { name: string; arguments: string };
  }>;
  /** Id of the tool call this message answers; forwarded to the API when set. */
  tool_call_id?: string;
  /** Tool name for tool-result messages; forwarded to the API when set. */
  name?: string;
}

/**
 * Settings for one agent run. runAgentLoop merges a partial config from the
 * caller over DEFAULT_CONFIG, and resolveApiConfig derives endpoint/key/model from it.
 */
interface AgentConfig {
  /** Model id or a short alias; aliases are expanded in resolveApiConfig. */
  model: string;
  /** Provider name. "claw", "default", "huggingface" and "deepinfra" always use the server's built-in endpoint. */
  apiProvider: string;
  /** User-supplied key. Ignored (server default is used) when missing, 4 characters or fewer, or masked with "••••". */
  apiKey?: string | null;
  /** Base URL override for the custom-provider path; a known provider URL is used when empty. */
  apiBaseUrl?: string | null;
  /** Requested max_tokens; runAgentLoop caps it per provider before sending. */
  maxTokens: number;
  /** Sampling temperature; runAgentLoop sends 0.6 instead for models it detects as thinking models. */
  temperature: number;
  /** Sent as top_p. */
  topP: number;
  /** Passed to buildSystemPrompt as the custom system prompt. */
  systemPrompt?: string | null;
  /** Passed to buildSystemPrompt as memory. */
  memory?: string | null;
  /** Working directory for git inspection, MCP init, hooks and tools; runAgentLoop falls back to "/home/ubuntu". */
  workDir?: string;
  /** Passed to buildSystemPrompt; runAgentLoop falls back to "high". */
  effortLevel?: "low" | "medium" | "high";
  /** Upper bound on loop iterations; a missing or 0 value falls back to 200 in runAgentLoop. */
  maxIterations?: number;
}

/**
 * TurnSummary — matches original conversation.rs TurnSummary struct.
 * Returned after each complete agent turn.
 *
 * NOTE(review): no value of this type is constructed in the portion of the
 * file visible here; the field notes below describe the declared shape only.
 */
export interface TurnSummary {
  /** Assistant messages belonging to the turn. */
  assistantMessages: AgentMessage[];
  /** Tool-result messages belonging to the turn. */
  toolResults: AgentMessage[];
  /** Iteration count for the turn. */
  iterations: number;
  /** Token usage in the TokenUsage shape from ./usage. */
  usage: TokenUsage;
}

/**
 * Capture a short, branch-annotated `git status` for a directory
 * (counterpart of read_git_status in the original prompt.rs).
 *
 * Best-effort: any failure (not a repository, git missing, timeout after 5s,
 * unreadable directory) is reported as null rather than thrown.
 *
 * @param cwd Directory in which git is executed.
 * @returns The trimmed command output, or null when it is empty or the command fails.
 */
function readGitStatus(cwd: string): string | null {
  let statusText: string;
  try {
    statusText = execSync("git --no-optional-locks status --short --branch", {
      encoding: "utf-8",
      stdio: ["pipe", "pipe", "pipe"],
      timeout: 5000,
      cwd,
    }).trim();
  } catch {
    return null;
  }
  return statusText === "" ? null : statusText;
}

/**
 * Read staged and unstaged git diffs for a directory
 * (counterpart of read_git_diff in the original prompt.rs).
 *
 * Each diff is collected independently and best-effort: if one command fails
 * (not a repository, git missing, 5s timeout, output too large for the
 * default buffer) that section is simply omitted.
 *
 * @param cwd Directory in which git is executed.
 * @returns "Staged changes:" and/or "Unstaged changes:" sections separated by a
 *          blank line, or null when neither diff produced any output.
 */
function readGitDiff(cwd: string): string | null {
  // Runs one git command and returns its trimmed output; failures yield ""
  // so the caller skips that section instead of aborting the whole read.
  const capture = (command: string): string => {
    try {
      return execSync(command, {
        cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"],
      }).trim();
    } catch {
      return "";
    }
  };

  const sections: string[] = [];

  const staged = capture("git diff --cached");
  if (staged) sections.push(`Staged changes:\n${staged}`);

  const unstaged = capture("git diff");
  if (unstaged) sections.push(`Unstaged changes:\n${unstaged}`);

  return sections.length > 0 ? sections.join("\n\n") : null;
}

/**
 * Append hook feedback to a tool's output (counterpart of the original
 * merge_hook_feedback()).
 *
 * @param hookMessages Messages produced by hooks; when empty, output is returned untouched.
 * @param output Tool output; omitted from the result when it is blank after trimming.
 * @param denied Selects the "Hook feedback (denied)" heading instead of "Hook feedback".
 * @returns The output (if non-blank) followed by a blank line and the labelled feedback.
 */
function mergeHookFeedback(hookMessages: string[], output: string, denied: boolean): string {
  if (hookMessages.length === 0) {
    return output;
  }

  const heading = denied ? "Hook feedback (denied)" : "Hook feedback";
  const feedback = `${heading}:\n${hookMessages.join("\n")}`;

  return output.trim() ? `${output}\n\n${feedback}` : feedback;
}

/**
 * Baseline agent settings. runAgentLoop spreads the caller's partial config
 * over this object, so any field the caller supplies replaces the value here.
 * The model and working directory can be overridden through the
 * DEFAULT_MODEL and WORKSPACE_DIR environment variables.
 */
const DEFAULT_CONFIG: AgentConfig = {
  model: process.env.DEFAULT_MODEL || "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
  apiProvider: "deepinfra",
  maxTokens: 32768,       // Qwen3-Coder supports up to 65k output
  temperature: 0.5,       // Lower temp = more focused/deterministic agent behavior
  topP: 0.95,             // Slightly restricted for more coherent tool calls
  workDir: process.env.WORKSPACE_DIR || "/home/ubuntu",
  effortLevel: "high",
};

/**
 * Delay between retries of the LLM request for transient API errors.
 * - 429 (rate limit): retried indefinitely at this interval.
 * - 500/502/503 (server errors): retried indefinitely at this interval.
 * - Network errors (fetch throws): retried indefinitely at this interval.
 * The retry loop in runAgentLoop never gives up on these by itself; it only
 * stops when the request succeeds, a non-retryable status is returned, or the
 * abort signal fires.
 */
const RETRY_DELAY_MS = 2000; // fixed 2 second interval — simple and reliable

/**
 * Resolve the chat-completions URL, API key and model id for a request.
 *
 * Two paths:
 * - Server default — taken when the provider is one of the built-in names
 *   ("claw", "default", "huggingface", "deepinfra") or when no usable custom
 *   key is present (missing, 4 characters or fewer, or masked with "••••").
 *   Uses ENV.forgeApiUrl / ENV.forgeApiKey, with a hard-coded fallback URL and
 *   the BUILT_IN_FORGE_API_KEY environment variable as fallbacks.
 * - Custom provider — uses config.apiKey together with config.apiBaseUrl, or
 *   the known base URL for config.apiProvider, or the fallback URL.
 *
 * Model aliases are expanded with an own-property lookup, so a model name that
 * happens to match an Object.prototype member (e.g. "constructor") is passed
 * through as a plain model id instead of resolving to a function.
 *
 * @param config Effective agent configuration.
 * @returns `url` (base URL with one trailing slash removed, plus
 *          "/chat/completions"), `key`, and the resolved `model` id.
 */
function resolveApiConfig(config: AgentConfig) {
  // ─── HARDCODED FALLBACK — always works even if settings are corrupted ───
  const FALLBACK_URL = "https://api.deepinfra.com/v1/openai";
  const FALLBACK_MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo";

  // Model aliases (used for both default and custom paths)
  const aliasMap: Record<string, string> = {
    // Xiaomi MiMo
    mimo: "XiaomiMiMo/MiMo-V2-Flash",
    "mimo-flash": "XiaomiMiMo/MiMo-V2-Flash",
    "mimo-v2": "XiaomiMiMo/MiMo-V2-Flash",
    // Qwen models (DeepInfra)
    "qwen-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
    "qwen-coder-turbo": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
    "qwen-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
    "qwen3-235b": "Qwen/Qwen3-235B-A22B-Instruct-2507",
    "qwen3-thinking": "Qwen/Qwen3-235B-A22B-Thinking-2507",
    "qwen3.5": "Qwen/Qwen3.5-397B-A17B",
    "qwen3-32b": "Qwen/Qwen3-32B",
    "qwen3-8b": "Qwen/Qwen3-8B",
    "qwen3-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
    // Llama
    llama: "meta-llama/Llama-3.3-70B-Instruct",
    "llama-70b": "meta-llama/Llama-3.3-70B-Instruct",
    "llama-4": "meta-llama/Llama-4-Maverick-17B-128E",
    // DeepSeek
    deepseek: "deepseek-ai/DeepSeek-V3.2",
    "deepseek-r1": "deepseek-ai/DeepSeek-R1-0528",
    "deepseek-v3": "deepseek-ai/DeepSeek-V3.2",
    // NVIDIA
    nemotron: "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B",
    // StepFun
    step: "stepfun-ai/Step-3.5-Flash",
    "step-flash": "stepfun-ai/Step-3.5-Flash",
    // Uncensored
    hermes: "NousResearch/Hermes-3-Llama-3.1-70B",
    "hermes-405b": "NousResearch/Hermes-3-Llama-3.1-405B",
    uncensored: "NousResearch/Hermes-3-Llama-3.1-70B",
    // OpenAI GPT-5.x family
    "gpt5": "gpt-5.4",
    "gpt-5": "gpt-5.4",
    "gpt54": "gpt-5.4",
    // Anthropic aliases
    opus: "claude-opus-4-6",
    sonnet: "claude-sonnet-4-6",
    haiku: "claude-haiku-4-5-20251213",
    // xAI
    grok: "grok-3",
    "grok-3": "grok-3",
    // Google
    gemini: "google/gemini-2.5-flash",
    "gemini-pro": "google/gemini-2.5-pro",
  };

  // Own-property lookup only: a bare `aliasMap[name]` also matches inherited
  // Object.prototype members ("constructor", "toString", ...), which would
  // make the "model" a function rather than a string.
  const aliasTarget = Object.prototype.hasOwnProperty.call(aliasMap, config.model)
    ? aliasMap[config.model]
    : undefined;
  // Alias target if any, otherwise the model exactly as configured (may be empty).
  const requestedModel = aliasTarget || config.model;

  // Treat empty, null, masked, or built-in providers as "use server default"
  const hasCustomKey = config.apiKey && config.apiKey.length > 4 && !config.apiKey.startsWith("••••");
  if (config.apiProvider === "claw" || config.apiProvider === "default" || config.apiProvider === "huggingface" || config.apiProvider === "deepinfra" || !hasCustomKey) {
    // defaultModel is always non-empty, so no further fallback is needed below.
    const defaultModel = process.env.DEFAULT_MODEL || FALLBACK_MODEL;
    const resolvedModel = requestedModel || defaultModel;
    // Use BUILT_IN_FORGE_API_URL from env — HuggingFace router or OpenAI
    const baseUrl = (ENV.forgeApiUrl || FALLBACK_URL).replace(/\/$/, "");
    const apiKey = ENV.forgeApiKey || process.env.BUILT_IN_FORGE_API_KEY || "";
    console.log(`[agent] resolveApiConfig: using server default. URL=${baseUrl}, model=${resolvedModel}, hasKey=${!!apiKey}`);
    return {
      url: `${baseUrl}/chat/completions`,
      key: apiKey,
      model: resolvedModel,
    };
  }

  // Custom provider path — user has their own API key
  let baseUrl = config.apiBaseUrl || "";
  if (!baseUrl) {
    // NOTE: "deepinfra" and "huggingface" are handled by the server-default
    // branch above, so their entries here are currently never consulted.
    const providers: Record<string, string> = {
      deepinfra: "https://api.deepinfra.com/v1/openai",
      huggingface: "https://router.huggingface.co/v1",
      xai: "https://api.x.ai/v1",
      openrouter: "https://openrouter.ai/api/v1",
      openai: "https://api.openai.com/v1",
      anthropic: "https://api.anthropic.com/v1",
      groq: "https://api.groq.com/openai/v1",
      cerebras: "https://api.cerebras.ai/v1",
      ollama: "http://localhost:11434/v1",
    };
    const providerUrl = Object.prototype.hasOwnProperty.call(providers, config.apiProvider)
      ? providers[config.apiProvider]
      : undefined;
    baseUrl = providerUrl || FALLBACK_URL;
  }

  const resolvedModel = requestedModel || FALLBACK_MODEL;
  console.log(`[agent] resolveApiConfig: custom provider. URL=${baseUrl}, model=${resolvedModel}`);
  return {
    url: `${baseUrl.replace(/\/$/, "")}/chat/completions`,
    key: config.apiKey,
    model: resolvedModel,
  };
}

/**
 * Write one Server-Sent Event frame to the client.
 *
 * The frame is `event: <name>` followed by `data: <JSON>` and a blank line.
 * Both serialisation and the write happen inside the guard, so any exception
 * (for example a closed connection) is swallowed rather than propagated.
 *
 * @param res Response stream the frame is written to.
 * @param event SSE event name.
 * @param data Payload; serialised with JSON.stringify.
 */
function sendSSE(res: Response, event: string, data: unknown) {
  try {
    const frame = `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`;
    res.write(frame);
  } catch {
    // Best-effort delivery: the connection may already be closed.
  }
}

/**
 * Run the agentic loop: send messages to LLM, execute tool calls, repeat.
 * This is the core of the agent — it loops until the LLM stops calling tools.
 */
export async function runAgentLoop(
  messages: AgentMessage[],
  sessionId: number,
  config: Partial<AgentConfig>,
  res: Response,
  signal?: AbortSignal
): Promise<{
  finalMessages: AgentMessage[];
  totalPromptTokens: number;
  totalCompletionTokens: number;
  totalCost: number;
  model: string;
}> {
  const cfg = { ...DEFAULT_CONFIG, ...config };
  const apiConfig = resolveApiConfig(cfg);
  const workDir = cfg.workDir || "/home/ubuntu";

  // Get plan mode state
  const planState = getPlanMode(sessionId);

  // Read git status and diff (matches original ProjectContext::discover_with_git)
  const gitStatus = readGitStatus(workDir);
  const gitDiff = readGitDiff(workDir);

  // Build system prompt with full environment context
  const systemPrompt = buildSystemPrompt({
    memory: cfg.memory,
    effortLevel: cfg.effortLevel || "high",
    planMode: planState.active,
    planSteps: planState.steps,
    customSystemPrompt: cfg.systemPrompt,
    workDir,
    platform: "linux",
    model: apiConfig.model,
    gitStatus,
    gitDiff,
  });

  // Initialize UsageTracker (matches original conversation.rs)
  const usageTracker = UsageTracker.new();

  // Build conversation with system message first
  const conversationMessages: AgentMessage[] = [
    { role: "system", content: systemPrompt },
    ...messages.filter((m) => m.role !== "system"),
  ];

  let totalPromptTokens = 0;
  let totalCompletionTokens = 0;
  let totalCost = 0;
  let iterations = 0;
  let emptyResponseRetries = 0;
  const MAX_EMPTY_RETRIES = 3;
  // Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that
  // causes runaway loops with Qwen3 which sometimes fails to stop generating.
  // 200 iterations is more than enough for any real task.
  const MAX_ITERATIONS = cfg.maxIterations || 200;
  const assistantMessages: AgentMessage[] = [];
  const toolResultMessages: AgentMessage[] = [];

  // ─── Loop detection: minimal safety net ─────────────────────────────
  // Only detect EXACT same tool+args repeated 5+ times (true infinite loop).
  // Everything else is handled by MAX_ITERATIONS.
  const recentToolSignatures: string[] = [];
  const MAX_EXACT_REPEATS = 5;

  // ─── MCP Tools Dynamic Injection (matches original claw-code) ──────────
  // Initialize MCP servers from config and merge discovered tools with static TOOL_DEFINITIONS.
  // This is how the original claw-code dynamically builds the tool list:
  //   1. Load MCP server configs from .claw/settings.json
  //   2. Connect to each server via stdio JSON-RPC
  //   3. Call tools/list to discover available tools
  //   4. Prefix tool names as mcp__servername__toolname
  //   5. Merge with static tool definitions
  let allTools = [...TOOL_DEFINITIONS];
  try {
    const mcpTools = await initializeMcpFromConfig(workDir);
    if (mcpTools.length > 0) {
      const mcpManager = getMcpManager();
      if (mcpManager) {
        const mcpDefs = mcpManager.getToolDefinitions();
        // Convert MCP tool format to OpenAI function calling format
        const mcpToolDefs = mcpDefs.map((t) => ({
          type: "function" as const,
          function: {
            name: t.name,
            description: t.description,
            parameters: t.input_schema || { type: "object", properties: {} },
          },
        }));
        allTools = [...TOOL_DEFINITIONS, ...mcpToolDefs];
        console.log(`[agent] MCP tools injected: ${mcpDefs.map((t) => t.name).join(", ")}`);
        sendSSE(res, "status", {
          status: "mcp_ready",
          message: `MCP tools loaded: ${mcpDefs.length} tools from ${mcpManager.getConnectedServers().length} servers`,
        });
      }
    }
  } catch (err: any) {
    console.error(`[agent] MCP initialization error (non-fatal):`, err.message);
    // MCP init failure is non-fatal — agent continues with static tools only
  }

  // ─── Context-aware compaction config ──────────────────────────────
  // Original claw-code uses percentage-based thresholds, not fixed 10k tokens.
  // We compute the threshold as 70% of the model's context window.
  const contextWindow = MODEL_CONTEXT_WINDOWS[apiConfig.model] || DEFAULT_CONTEXT_WINDOW;
  const dynamicCompactionConfig: import("./compact").CompactionConfig = {
    preserveRecentMessages: DEFAULT_COMPACTION_CONFIG.preserveRecentMessages,
    maxEstimatedTokens: Math.floor(contextWindow * 0.7),
  };

  sendSSE(res, "status", { status: "thinking", message: "Processing your request..." });

  while (iterations < MAX_ITERATIONS) {
    iterations++;

    if (signal?.aborted) {
      sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" });
      break;
    }

    // Build API request
    // Determine max_tokens limit based on provider
    const isDeepInfra = apiConfig.url.includes("deepinfra.com");
    const isHuggingFace = apiConfig.url.includes("huggingface.co");
    const maxTokensLimit = isHuggingFace ? 32000 : (isDeepInfra ? 65536 : 65536);

    // Detect if model supports thinking/reasoning mode (Qwen3 Thinking, DeepSeek-R1)
    const isThinkingModel = apiConfig.model.includes("Thinking") || apiConfig.model.includes("R1");

    const payload: Record<string, unknown> = {
      model: apiConfig.model,
      messages: conversationMessages.map((m) => {
        const msg: Record<string, unknown> = { role: m.role, content: m.content };
        if (m.tool_calls) msg.tool_calls = m.tool_calls;
        if (m.tool_call_id) msg.tool_call_id = m.tool_call_id;
        if (m.name) msg.name = m.name;
        return msg;
      }),
      tools: allTools,
      // Match original: always "auto" — let the model decide what tools to use
      tool_choice: "auto",
      max_tokens: Math.min(cfg.maxTokens, maxTokensLimit),
      temperature: isThinkingModel ? 0.6 : cfg.temperature, // thinking models need lower temp
      top_p: cfg.topP,
      stream: true,
    };

    // Enable thinking/reasoning for supported models (Qwen3 Thinking, DeepSeek-R1)
    if (isThinkingModel && isDeepInfra) {
      (payload as any).extra_body = { enable_thinking: true };
    }

    // Qwen3-Coder: hybrid thinking mode — model decides when to think
    // DO NOT disable thinking; Qwen3 automatically uses <think> blocks for complex tasks
    // The /no_think in system prompt is only a soft hint for simple responses,
    // the model will override it when it needs to reason through tool calls or code

    // NOTE: Prefill technique (empty assistant message) was removed because it caused
    // the model to "talk to itself" — seeing an empty assistant message made it think
    // it already responded and continue generating follow-up messages in a loop.
    // Anti-censorship is handled purely via system prompt directives instead.

    sendSSE(res, "message_start", { iteration: iterations });

    try {
      // ─── Retry loop: unbounded retries for transient failures ─────────
      // 429 (rate limit), 500/502/503 (server errors) and network errors are all
      // retried indefinitely at a fixed RETRY_DELAY_MS interval, until the request
      // succeeds or the abort signal fires. Any other status is not retried.
      let response: globalThis.Response | null = null;
      let lastErrorText = "";
      let lastStatus = 0;
      let serverRetryCount = 0;
      let rateLimitRetryCount = 0;

      while (true) {
        if (signal?.aborted) break;

        try {
          response = await fetch(apiConfig.url, {
            method: "POST",
            headers: {
              "Content-Type": "application/json",
              Authorization: `Bearer ${apiConfig.key}`,
            },
            body: JSON.stringify(payload),
            signal,
          });
        } catch (fetchErr: any) {
          // Network error (DNS, connection refused, etc.) — retry every 2s
          if (signal?.aborted) break;
          serverRetryCount++;
          console.error(`[agent] Fetch error (retry #${serverRetryCount}):`, fetchErr.message);
          sendSSE(res, "status", {
            status: "retrying",
            message: `Network error, retrying in 2s... (attempt #${serverRetryCount})`,
          });
          await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
          continue;
        }

        if (response.ok) break;

        lastStatus = response.status;
        lastErrorText = await response.text();

        // ─── 429 Rate Limit: INFINITE retry every 2s ───
        if (response.status === 429) {
          rateLimitRetryCount++;
          console.log(`[agent] Rate limited (429) — retry #${rateLimitRetryCount} in 2s`);
          sendSSE(res, "status", {
            status: "rate_limited",
            message: `Rate limited by API — retrying in 2s... (attempt #${rateLimitRetryCount})`,
          });
          await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
          response = null;
          continue; // NEVER give up on 429
        }

        // ─── 500/502/503 Server errors: INFINITE retry every 2s ───
        if ([500, 502, 503].includes(response.status)) {
          serverRetryCount++;
          console.log(`[agent] Server error ${response.status} — retry #${serverRetryCount} in 2s`);
          sendSSE(res, "status", {
            status: "retrying",
            message: `Server error ${response.status}, retrying in 2s... (attempt #${serverRetryCount})`,
          });
          await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
          response = null;
          continue;
        }

        // Any other error (400, 401, 403, 404, etc.) — don't retry
        break;
      }

      if (!response || !response.ok) {
        console.error(`[agent] API error ${lastStatus}:`, lastErrorText);
        console.error(`[agent] Payload model:`, apiConfig.model);
        console.error(`[agent] Payload messages count:`, (payload.messages as any[]).length);

        // ─── AUTO-COMPACT on context overflow (400 error) ─────────────
        if (lastStatus === 400 && (lastErrorText.includes("context_length") || lastErrorText.includes("too many tokens") || lastErrorText.includes("maximum context") || lastErrorText.includes("token limit") || lastErrorText.includes("too long"))) {
          console.log(`[agent] Context overflow detected — auto-compacting conversation...`);
          sendSSE(res, "status", {
            status: "compacting",
            message: "Context window exceeded — auto-compacting conversation...",
          });

          try {
            const session = agentMessagesToSession(conversationMessages);

            // LLM-based summarization: use the same API to produce a real summary
            const llmFetch = async (msgs: Array<{role: string; content: string}>) => {
              const summaryResp = await fetch(apiConfig.url, {
                method: "POST",
                headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` },
                body: JSON.stringify({
                  model: apiConfig.model,
                  messages: msgs,
                  max_tokens: 2000,
                  temperature: 0.3,
                  stream: false,
                }),
              });
              if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`);
              const json = await summaryResp.json();
              return json.choices?.[0]?.message?.content || "";
            };

            const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetch);

            if (compactResult.removedMessageCount > 0) {
              const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession);
              conversationMessages.length = 0;
              conversationMessages.push({ role: "system", content: systemPrompt });
              conversationMessages.push(...compactedAgentMessages);

              console.log(`[agent] Auto-compact (LLM): removed ${compactResult.removedMessageCount} messages, kept ${conversationMessages.length}`);
              sendSSE(res, "auto_compact", {
                removedCount: compactResult.removedMessageCount,
                keptCount: conversationMessages.length,
                summary: compactResult.formattedSummary,
              });
              continue; // retry with compacted context
            } else {
              console.error(`[agent] Auto-compact produced no reduction — breaking`);
              sendSSE(res, "error", {
                message: `Context overflow but compaction couldn't reduce further`,
                details: lastErrorText,
              });
              break;
            }
          } catch (compactErr: any) {
            console.error(`[agent] Auto-compact failed:`, compactErr.message);
            sendSSE(res, "error", {
              message: `Context overflow — auto-compact failed: ${compactErr.message}`,
              details: lastErrorText,
            });
            break;
          }
        }

        // Non-context 400 errors — log details for debugging
        if (lastStatus === 400) {
          console.error(`[agent] Full error body:`, lastErrorText);
          (payload.messages as any[]).forEach((m: any, i: number) => {
            console.error(`[agent] msg[${i}] role=${m.role} content_type=${typeof m.content} content_len=${String(m.content || '').length} has_tool_calls=${!!m.tool_calls} has_tool_call_id=${!!m.tool_call_id}`);
          });
        }
        sendSSE(res, "error", {
          message: `API error: ${lastStatus}${lastStatus === 429 ? ' (rate limit)' : ''} — ${lastErrorText.substring(0, 200)}`,
          details: lastErrorText,
        });
        break;
      }

      // Process streaming response
      let result: { content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string } }>; usage?: any };
      try {
        result = await processStream(response, res, signal);
      } catch (streamErr: any) {
        // Stream processing error — treat as transient, retry
        console.error(`[agent] Stream processing error:`, streamErr.message);
        if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) {
          sendSSE(res, "status", { status: "retrying", message: `Stream error, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` });
          await new Promise(r => setTimeout(r, 1500));
          continue;
        }
        sendSSE(res, "error", { message: `Stream failed after ${MAX_EMPTY_RETRIES} retries: ${streamErr.message}` });
        break;
      }

      // ─── Bug #1 fix: Handle empty LLM response with retry ─────────
      // Original claw-code retries on empty response instead of crashing.
      // Open-source models via HuggingFace often return empty streams.
      if (!result.content && result.toolCalls.length === 0) {
        if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) {
          console.warn(`[agent] Empty response from LLM — retry ${emptyResponseRetries}/${MAX_EMPTY_RETRIES}`);
          sendSSE(res, "status", { status: "retrying", message: `Empty response from model, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` });
          await new Promise(r => setTimeout(r, 1500));
          continue; // retry same iteration
        }
        console.error(`[agent] LLM returned empty response ${MAX_EMPTY_RETRIES} times — giving up`);
        sendSSE(res, "error", { message: `Model returned empty response after ${MAX_EMPTY_RETRIES} retries. Try a different model or reduce context.` });
        break;
      }
      emptyResponseRetries = 0; // reset on successful response

      // Track usage with UsageTracker (matches original)
      if (result.usage) {
        totalPromptTokens += result.usage.prompt_tokens || 0;
        totalCompletionTokens += result.usage.completion_tokens || 0;
        usageTracker.record({
          input_tokens: result.usage.prompt_tokens || 0,
          output_tokens: result.usage.completion_tokens || 0,
          cache_creation_input_tokens: result.usage.cache_creation_input_tokens || 0,
          cache_read_input_tokens: result.usage.cache_read_input_tokens || 0,
        });
      }

      // Add assistant message to conversation
      const assistantMessage: AgentMessage = {
        role: "assistant",
        // Match original: null when no content (even with tool_calls)
        content: result.content || null,
      };
      if (result.toolCalls && result.toolCalls.length > 0) {
        assistantMessage.tool_calls = result.toolCalls;
      }
      conversationMessages.push(assistantMessage);
      assistantMessages.push(assistantMessage);

      // If no tool calls, we're done — the LLM has finished responding.
      // This matches the original claw-code behavior exactly:
      // the model decides when to stop by not calling tools.
      if (!result.toolCalls || result.toolCalls.length === 0) {
        sendSSE(res, "message_end", {
          promptTokens: totalPromptTokens,
          completionTokens: totalCompletionTokens,
          cost: totalCost,
          model: apiConfig.model,
        });
        break;
      }

      // ─── Minimal loop detection: only catch TRUE infinite loops ───────
      // Only break if the EXACT same tool+args is repeated 5+ times.
      // This is the only safety net beyond MAX_ITERATIONS.
      // The original claw-code has NO loop detection at all — it trusts the model.
      const currentToolSig = result.toolCalls.map((tc: any) => `${tc.function.name}:${tc.function.arguments}`).join("|");
      recentToolSignatures.push(currentToolSig);
      if (recentToolSignatures.length > MAX_EXACT_REPEATS) {
        recentToolSignatures.shift();
      }
      if (recentToolSignatures.length >= MAX_EXACT_REPEATS) {
        const allSame = recentToolSignatures.every(r => r === recentToolSignatures[0]);
        if (allSame) {
          console.warn(`[agent] Infinite loop detected: exact same tool call repeated ${MAX_EXACT_REPEATS} times — breaking`);
          sendSSE(res, "error", {
            message: `⚠️ обнаружен бесконечный цикл. попробуй переформулировать запрос`,
          });
          sendSSE(res, "message_end", {
            promptTokens: totalPromptTokens,
            completionTokens: totalCompletionTokens,
            cost: totalCost,
            model: apiConfig.model,
          });
          break;
        }
      }

      // ─── Execute tool calls ──────────────────────────────────────────
      // Bug #2+#3 fix: Each tool call is wrapped in its own try-catch.
      // Original claw-code sends tool errors back to LLM as tool results,
      // letting the model decide how to handle them. We NEVER break the
      // loop on a tool error — only on fatal API/stream errors.
      for (const toolCall of result.toolCalls) {
        const toolName = toolCall.function.name;
        let toolArgs: Record<string, unknown> = {};
        let argParseError = false;
        try {
          toolArgs = JSON.parse(toolCall.function.arguments || "{}");
        } catch (parseErr: any) {
          // Try JSON repair before giving up
          try {
            const { jsonrepair } = await import("jsonrepair");
            const repaired = jsonrepair(toolCall.function.arguments || "{}");
            toolArgs = JSON.parse(repaired);
            console.info(`[agent] Repaired malformed JSON for ${toolName}`);
          } catch (repairErr: any) {
            argParseError = true;
            console.warn(`[agent] Malformed tool args for ${toolName} (repair failed):`, toolCall.function.arguments?.substring(0, 200));
          }
        }

        sendSSE(res, "tool_call_start", {
          id: toolCall.id,
          name: toolName,
          arguments: toolCall.function.arguments,
        });

        let toolOutput: string;
        let isError = false;

        // If JSON args were malformed, skip execution and tell LLM to fix
        if (argParseError) {
          toolOutput = `Error: Your tool call arguments for '${toolName}' contained malformed JSON. The raw arguments were: ${(toolCall.function.arguments || "").substring(0, 500)}. Please fix the JSON and try again.`;
          isError = true;
        } else try {
          // ─── Pre-tool hooks (matches original HookRunner.run_pre_tool_use) ──
          const preHookResult = await runPreToolHooks(toolName, sessionId, toolArgs, workDir);

          if (!preHookResult.allowed) {
            // Hook denied the tool execution (exit code 2 = deny)
            toolOutput = preHookResult.message || `Tool '${toolName}' was denied by pre-tool hook`;
            isError = true;
            sendSSE(res, "permission_denied", {
              toolName,
              toolCallId: toolCall.id,
              reason: toolOutput,
              needsPrompt: false,
            });
          } else {
            // Execute the tool with the correct working directory
            const toolResult = await executeTool(toolName, toolArgs, sessionId, workDir);
            if (toolResult.isError && toolResult.output.includes("needs one-time approval")) {
              sendSSE(res, "permission_prompt", {
                toolName,
                toolCallId: toolCall.id,
                reason: toolResult.output,
              });
            }
            toolOutput = toolResult.output;
            isError = toolResult.isError || false;

            // Merge pre-hook feedback (matches original merge_hook_feedback)
            if (preHookResult.message) {
              toolOutput = mergeHookFeedback([preHookResult.message], toolOutput, false);
            }

            // ─── Post-tool hooks (matches original HookRunner.run_post_tool_use) ──
            const postHookResult = await runPostToolHooks(toolName, sessionId, toolResult, workDir);
            toolOutput = postHookResult.output;
            isError = postHookResult.isError || false;
          }
        } catch (toolExecError: any) {
          // ─── Bug #3 fix: Tool exception → error result for LLM ──────
          // Original claw-code: tool errors become tool results, NOT loop breaks.
          // The LLM sees the error and can try a different approach.
          console.error(`[agent] Tool '${toolName}' threw exception:`, toolExecError.message);
          toolOutput = `Tool execution error: ${toolExecError.message}`;
          isError = true;
        }

        // No error classification or guidance injection.
        // The model receives raw error output and decides how to handle it.
        // This matches the original claw-code behavior.

        sendSSE(res, "tool_result", {
          toolCallId: toolCall.id,
          toolName,
          output: toolOutput,
          isError,
          durationMs: 0,
        });

        // ─── Special SSE events for interactive tools ────────────────

        // SendUserMessage / Brief: emit SSE for frontend display but DO NOT break the loop.
        // Original claw-code does NOT stop on SendUserMessage — the model can send
        // progress updates ("checking...", "found vulnerability...") AND continue working.
        // Breaking here was the #1 cause of the agent stopping mid-task.
        if ((toolName === "SendUserMessage" || toolName === "Brief" || toolName === "ask_user") && !isError) {
          sendSSE(res, "assistant_message", {
            message: toolArgs.message || toolArgs.question || "",
            attachments: toolArgs.attachments || [],
          });
        }

        // Plan/Todo tools: emit plan state updates
        if (["TodoWrite", "plan_create", "plan_update", "enter_plan_mode", "exit_plan_mode"].includes(toolName)) {
          const updatedPlan = getPlanMode(sessionId);
          sendSSE(res, "plan_update", {
            active: updatedPlan.active,
            steps: updatedPlan.steps,
          });
        }

        // Add tool result to conversation for the LLM to process
        const toolResultMsg: AgentMessage = {
          role: "tool",
          content: toolOutput,
          tool_call_id: toolCall.id,
          name: toolName,
        };
        conversationMessages.push(toolResultMsg);
        toolResultMessages.push(toolResultMsg);
      }

      // No consecutive error detection — the model handles errors naturally.
      // MAX_ITERATIONS (200) is the ultimate safety net.
      // SendUserMessage does NOT break the loop (matches original).

      // ─── Proactive auto-compact check ─────────────────────────────
      // Check if conversation is approaching context window limit and compact proactively
      const estimatedTokens = estimateConversationTokens(conversationMessages);
      // contextWindow already computed above (line 397)
      const contextUsagePercent = Math.round((estimatedTokens / contextWindow) * 100);

      // Emit context usage SSE for frontend tracking
      sendSSE(res, "context_usage", {
        estimatedTokens,
        contextWindow,
        usagePercent: contextUsagePercent,
        messageCount: conversationMessages.length,
      });

      // Proactive compaction at 80% context usage
      if (contextUsagePercent >= 80) {
        console.log(`[agent] Context at ${contextUsagePercent}% — proactive auto-compact`);
        sendSSE(res, "status", {
          status: "compacting",
          message: `Context at ${contextUsagePercent}% — auto-compacting to free space...`,
        });

        try {
          const session = agentMessagesToSession(conversationMessages);

          // LLM-based summarization for proactive compaction
          const llmFetchProactive = async (msgs: Array<{role: string; content: string}>) => {
            const summaryResp = await fetch(apiConfig.url, {
              method: "POST",
              headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` },
              body: JSON.stringify({
                model: apiConfig.model,
                messages: msgs,
                max_tokens: 2000,
                temperature: 0.3,
                stream: false,
              }),
            });
            if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`);
            const json = await summaryResp.json();
            return json.choices?.[0]?.message?.content || "";
          };

          const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetchProactive);
          if (compactResult.removedMessageCount > 0) {
            const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession);
            conversationMessages.length = 0;
            // CRITICAL: Re-prepend original system prompt before compacted summary.
            conversationMessages.push({ role: "system", content: systemPrompt });
            conversationMessages.push(...compactedAgentMessages);

            // Inject current todo/plan state so the agent doesn't lose its plan after compaction
            const todoState = (() => {
              try {
                const executor = require("../tools/executor");
                const plan = executor.getPlanMode(sessionId);
                const todos = executor.todoLists?.get?.(sessionId) || [];
                let state = "";
                if (todos.length > 0) {
                  state += "\n\n[PRESERVED TODO LIST]\n" + todos.map((t: any, i: number) => {
                    const icon = t.status === "completed" ? "\u2713" : t.status === "in_progress" ? "\u25cf" : "\u25cb";
                    return `  ${icon} ${i + 1}. ${t.content} [${t.status}]`;
                  }).join("\n");
                }
                if (plan?.active && plan.steps?.length > 0) {
                  state += "\n\n[PRESERVED PLAN]\n" + plan.steps.map((s: any) => {
                    const icon = s.status === "done" ? "\u2713" : s.status === "in_progress" ? "\u25cf" : "\u25a1";
                    return `  ${icon} ${s.id}. ${s.text} [${s.status}]`;
                  }).join("\n");
                }
                return state;
              } catch { return ""; }
            })();

            if (todoState) {
              // Append todo state to the last user/system message so the model sees it
              const lastMsg = conversationMessages[conversationMessages.length - 1];
              if (lastMsg && typeof lastMsg.content === "string") {
                lastMsg.content += todoState;
              }
            }

            sendSSE(res, "auto_compact", {
              removedCount: compactResult.removedMessageCount,
              keptCount: conversationMessages.length,
              summary: compactResult.formattedSummary,
            });
            console.log(`[agent] Proactive compact: removed ${compactResult.removedMessageCount} messages`);
          }
        } catch (compactErr: any) {
          console.error(`[agent] Proactive compact failed (non-fatal):`, compactErr.message);
        }
      }

      // ─── Buddy events SSE ────────────────────────────────────────────
      // Emit buddy_event for each tool call so frontend can award XP
      for (const toolCall of result.toolCalls) {
        const tn = toolCall.function.name;
        sendSSE(res, "buddy_event", {
          type: "tool_call",
          toolName: tn,
          iteration: iterations,
        });
        // Special buddy events for file creation
        if (tn === "write_file" || tn === "create_file") {
          sendSSE(res, "buddy_event", {
            type: "file_created",
            toolName: tn,
            iteration: iterations,
          });
        }
      }

      // Continue the loop — LLM will see tool results and decide next action
      sendSSE(res, "status", {
        status: "thinking",
        message: `Processing tool results (iteration ${iterations}, context: ${contextUsagePercent}%)...`,
      });
    } catch (error: any) {
      // ─── Bug #2 fix: Distinguish fatal vs transient errors ────────
      // Only AbortError and unrecoverable errors should break the loop.
      // Stream/fetch errors are already handled above with retry logic.
      if (error.name === "AbortError" || signal?.aborted) {
        sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" });
        break;
      }
      // For other errors, log and break (these are truly unexpected)
      console.error(`[agent] Unexpected error in agent loop:`, error.message, error.stack);
      sendSSE(res, "error", { message: error.message || "Unknown error" });
      break;
    }
  }

  if (iterations >= MAX_ITERATIONS) {
    sendSSE(res, "error", { message: `Maximum iterations (${MAX_ITERATIONS}) reached. Use /compact to reduce context and continue.` });
  }

  // ─── Buddy: session_completed event ─────────────────────────────────
  // Emit session_completed so Buddy can award XP for finishing a turn
  sendSSE(res, "buddy_event", {
    type: "session_completed",
    iterations,
    toolCallCount: toolResultMessages.length,
  });

  // Calculate cost using UsageTracker (matches original)
  const cumulativeUsage = usageTracker.cumulativeUsage();
  const modelPricing = pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing();
  const costEstimate = estimateCostUsdWithPricing(cumulativeUsage, modelPricing);
  totalCost = totalCostUsd(costEstimate);

  // Emit usage summary lines (matches original summary_lines_for_model)
  const usageSummary = summaryLinesForModel(cumulativeUsage, "session", apiConfig.model);
  sendSSE(res, "usage", {
    promptTokens: totalPromptTokens,
    completionTokens: totalCompletionTokens,
    totalTokens: totalPromptTokens + totalCompletionTokens,
    cost: totalCost,
    cacheCreationTokens: cumulativeUsage.cache_creation_input_tokens,
    cacheReadTokens: cumulativeUsage.cache_read_input_tokens,
    usageSummary,
    turns: usageTracker.turns(),
    formattedCost: formatUsd(totalCost),
  });

  return {
    finalMessages: conversationMessages.filter((m) => m.role !== "system"),
    totalPromptTokens,
    totalCompletionTokens,
    totalCost,
    model: apiConfig.model,
  };
}

/**
 * Consume an OpenAI-compatible SSE completion stream and fold it into one result.
 *
 * Each `data:` line is parsed as a JSON chunk. Reasoning text, answer text and
 * tool-call argument fragments are forwarded to the client via `sendSSE` as they
 * arrive and are accumulated for the return value. The final line of the stream
 * is handled by the same code path as every other line, even when the upstream
 * omits the trailing newline.
 *
 * @param response - Upstream fetch response whose body carries the SSE stream.
 * @param res - Client response handed to `sendSSE` for forwarded events.
 * @param signal - Optional abort signal; checked before every read.
 * @returns The accumulated text, the tool calls in order of first appearance,
 *          and the most recent usage block seen in the stream (if any).
 * @throws Error when the response has no body, or when a chunk carries an
 *         `error` field (message is prefixed with "API error in stream:").
 */
async function processStream(
  response: globalThis.Response,
  res: Response,
  signal?: AbortSignal
): Promise<{
  content: string;
  toolCalls: Array<{
    id: string;
    type: "function";
    function: { name: string; arguments: string };
  }>;
  usage?: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number };
}> {
  type PendingToolCall = { id: string; type: "function"; function: { name: string; arguments: string } };
  type StreamUsage = {
    prompt_tokens: number;
    completion_tokens: number;
    cache_creation_input_tokens?: number;
    cache_read_input_tokens?: number;
  };

  const reader = response.body?.getReader();
  if (!reader) throw new Error("No response body");

  const decoder = new TextDecoder();
  // Keyed by the tool call's stream index; Map keeps first-appearance order.
  const toolCalls = new Map<number, PendingToolCall>();
  let content = "";
  let usage: StreamUsage | undefined;
  let buffer = "";

  // Folds one successfully parsed chunk into the accumulators and forwards deltas.
  const applyChunk = (chunk: any): void => {
    const choice = chunk.choices?.[0];

    if (choice?.finish_reason === "length") {
      console.warn("[agent] Response truncated (finish_reason=length) — tool call args may be incomplete");
    }

    const delta = choice?.delta;
    if (delta) {
      // Reasoning/thinking content arrives in delta.reasoning_content.
      if (delta.reasoning_content) {
        sendSSE(res, "thinking_delta", { text: delta.reasoning_content });
      }

      if (delta.content) {
        content += delta.content;
        sendSSE(res, "text_delta", { text: delta.content });
      }

      if (delta.tool_calls) {
        for (const tc of delta.tool_calls) {
          const idx = tc.index ?? 0;
          let call = toolCalls.get(idx);
          if (!call) {
            call = {
              // Synthesize an id when the first fragment does not carry one.
              id: tc.id || `call_${idx}_${Date.now()}`,
              type: "function",
              function: { name: tc.function?.name || "", arguments: "" },
            };
            toolCalls.set(idx, call);
          }
          if (tc.id) call.id = tc.id;
          if (tc.function?.name) call.function.name = tc.function.name;
          if (tc.function?.arguments) {
            call.function.arguments += tc.function.arguments;
            sendSSE(res, "tool_call_delta", {
              id: call.id,
              name: call.function.name,
              arguments: tc.function.arguments,
            });
          }
        }
      }
    }

    // Usage may appear on a chunk with or without a delta; the last one wins.
    if (chunk.usage) {
      usage = {
        prompt_tokens: chunk.usage.prompt_tokens || 0,
        completion_tokens: chunk.usage.completion_tokens || 0,
        cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0,
        cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0,
      };
    }
  };

  // Handles a single SSE line; anything that is not a `data:` field is ignored.
  const handleLine = (line: string): void => {
    // The space after the colon is optional in the SSE format; trim() removes it
    // together with a trailing "\r" from CRLF-terminated streams.
    if (!line.startsWith("data:")) return;
    const data = line.slice(5).trim();
    if (data === "" || data === "[DONE]") return;

    let chunk: any;
    try {
      chunk = JSON.parse(data);
    } catch {
      // Genuinely malformed JSON (partial SSE data, etc.) is skipped.
      return;
    }

    // API errors delivered inside the stream are real failures, not parse noise,
    // so they are raised outside the best-effort try below.
    if (chunk?.error) {
      const errMsg = chunk.error.message || chunk.error.type || JSON.stringify(chunk.error);
      console.error(`[agent] API error in stream: ${errMsg}`);
      throw new Error(`API error in stream: ${errMsg}`);
    }

    try {
      applyChunk(chunk);
    } catch (err: unknown) {
      // Best-effort: one unprocessable chunk must not abort the whole stream.
      const reason = err instanceof Error ? err.message : String(err);
      console.warn(`[agent] Skipping unprocessable stream chunk: ${reason}`);
    }
  };

  try {
    while (true) {
      if (signal?.aborted) break;

      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });

      // Everything before the last "\n" is complete; the remainder stays buffered.
      const lines = buffer.split("\n");
      buffer = lines.pop() ?? "";

      for (const line of lines) {
        handleLine(line);
      }
    }
  } finally {
    reader.releaseLock();
  }

  // Flush bytes the streaming decoder is still holding, then process the last
  // line if the stream ended without a trailing "\n".
  buffer += decoder.decode();
  if (buffer.trim()) {
    handleLine(buffer);
  }

  // Original claw-code retries on empty response instead of throwing.
  if (content.length === 0 && toolCalls.size === 0) {
    console.warn("[agent] LLM returned empty response — will be retried by agent loop");
  }

  return {
    content,
    toolCalls: Array.from(toolCalls.values()),
    usage,
  };
}

/**
 * Estimate the USD cost of a request from its model id and token counts.
 *
 * Rates are expressed per one million tokens. Models missing from the table
 * are billed at a default of $1.00 input / $3.00 output per million.
 *
 * @param model - Model identifier exactly as it appears in the pricing table.
 * @param promptTokens - Number of input tokens.
 * @param completionTokens - Number of output tokens.
 * @returns Estimated cost in USD.
 */
function estimateCost(model: string, promptTokens: number, completionTokens: number): number {
  // Pricing per 1M tokens — aligned with original claw-code model registry
  const pricing: Record<string, { input: number; output: number }> = {
    // Claw API / Anthropic
    "claude-opus-4-6": { input: 15.00, output: 75.00 },
    "claude-sonnet-4-6": { input: 3.00, output: 15.00 },
    "claude-haiku-4-5-20251213": { input: 0.80, output: 4.00 },
    // xAI Grok
    "grok-3": { input: 3.00, output: 15.00 },
    "grok-3-mini": { input: 0.30, output: 0.50 },
    "grok-2": { input: 2.00, output: 10.00 },
    // OpenAI
    "gpt-5.4": { input: 2.50, output: 15.00 },
    "gpt-5.4-mini": { input: 0.40, output: 1.60 },
    "gpt-5.3-codex": { input: 2.50, output: 10.00 },
    "gpt-4.1": { input: 2.00, output: 8.00 },
    "gpt-4.1-mini": { input: 0.40, output: 1.60 },
    "o3": { input: 10.00, output: 40.00 },
    "o4-mini": { input: 1.10, output: 4.40 },
    // HuggingFace Inference API (free tier = $0, Pro tier = included in subscription)
    "XiaomiMiMo/MiMo-V2-Flash": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-Coder-Next": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-8B": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct": { input: 0.00, output: 0.00 },
    "meta-llama/Llama-3.3-70B-Instruct": { input: 0.00, output: 0.00 },
    "deepseek-ai/DeepSeek-V3.2": { input: 0.00, output: 0.00 },
    "deepseek-ai/DeepSeek-R1": { input: 0.00, output: 0.00 },
    // OpenRouter variants
    "anthropic/claude-opus-4-6": { input: 15.00, output: 75.00 },
    "anthropic/claude-sonnet-4-6": { input: 3.00, output: 15.00 },
    "google/gemini-2.5-pro": { input: 1.25, output: 10.00 },
    "google/gemini-2.5-flash": { input: 0.15, output: 0.60 },
  };
  const defaultRates = { input: 1.00, output: 3.00 };

  // Only own keys count: a plain index lookup would also resolve inherited
  // Object.prototype members ("constructor", "toString", "__proto__", ...),
  // which are truthy but carry no rates and would turn the result into NaN.
  const isListed = Object.prototype.hasOwnProperty.call(pricing, model);
  const rates = (isListed ? pricing[model] : undefined) ?? defaultRates;

  return (promptTokens * rates.input + completionTokens * rates.output) / 1_000_000;
}