/**
 * Claw Agent Runtime — the core agentic conversation loop.
 * Handles streaming responses, tool calls, and multi-turn conversations.
 */
import { ENV } from "../_core/env";
import { buildSystemPrompt, TOOL_DEFINITIONS } from "./system-prompt";
import { executeTool, getPlanMode, runPreToolHooks, runPostToolHooks, initializeMcpFromConfig, getMcpManager } from "../tools/executor";
import { compactSession, compactSessionWithLLM, shouldCompact, estimateSessionTokens, dbMessagesToSession, DEFAULT_COMPACTION_CONFIG } from "./compact";
import type { Session, ConversationMessage as CompactMessage, CompactionConfig } from "./compact";
import { UsageTracker, pricingForModel, defaultSonnetTierPricing, estimateCostUsdWithPricing, totalCostUsd, formatUsd, summaryLinesForModel } from "./usage";
import type { TokenUsage } from "./usage";
import type { Response } from "express";
import { execSync } from "child_process";

// In original claw-code, max_iterations defaults to usize::MAX (effectively unlimited).
// Auto-compact is triggered on context overflow (400 error) — matches original compact() method.

// Context window sizes for known models (used for proactive compaction)
const MODEL_CONTEXT_WINDOWS: Record<string, number> = {
  // Xiaomi MiMo
  "XiaomiMiMo/MiMo-V2-Flash": 262144,
  // Qwen models (DeepInfra + HuggingFace)
  "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo": 262144,
  "Qwen/Qwen3-Coder-480B-A35B-Instruct": 262144,
  "Qwen/Qwen3-235B-A22B-Instruct-2507": 262144,
  "Qwen/Qwen3-235B-A22B-Thinking-2507": 262144,
  "Qwen/Qwen3.5-397B-A17B": 262144,
  "Qwen/Qwen3.5-122B-A10B": 262144,
  "Qwen/Qwen3-Coder-Next": 131072,
  "Qwen/Qwen3-32B": 40960,
  "Qwen/Qwen3-8B": 32768,
  "Qwen/Qwen3-Coder-30B-A3B-Instruct": 131072,
  // Meta Llama
  "meta-llama/Llama-3.3-70B-Instruct": 131072,
  "meta-llama/Llama-4-Maverick-17B-128E": 1048576,
  "meta-llama/Llama-4-Scout-17B-16E": 327680,
  // DeepSeek
  "deepseek-ai/DeepSeek-V3.2": 163840,
  "deepseek-ai/DeepSeek-V3.1": 163840,
  "deepseek-ai/DeepSeek-R1": 131072,
  "deepseek-ai/DeepSeek-R1-0528": 163840,
  // NVIDIA Nemotron
  "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B": 262144,
  // StepFun
  "stepfun-ai/Step-3.5-Flash": 262144,
  // NousResearch (uncensored)
  "NousResearch/Hermes-3-Llama-3.1-70B": 131072,
  "NousResearch/Hermes-3-Llama-3.1-405B": 131072,
  // Anthropic
  "claude-opus-4-6": 200000,
  "claude-sonnet-4-6": 200000,
  // OpenAI
  "gpt-5.4": 1048576,
  "gpt-4.1": 1048576,
  // xAI
  "grok-3": 131072,
  // Google
  "google/gemini-2.5-flash": 1000000,
  "google/gemini-2.5-pro": 1000000,
};

const DEFAULT_CONTEXT_WINDOW = 131072;

/**
 * Convert agent messages to compact.ts Session format for compaction.
 */
function agentMessagesToSession(messages: AgentMessage[]): Session {
  return dbMessagesToSession(
    messages.map((m) => ({
      role: m.role,
      content: m.content || "",
      toolName: m.name || null,
      toolCallId: m.tool_call_id || null,
    }))
  );
}

/**
 * Convert compacted Session back to AgentMessage[] format.
 */
function sessionToAgentMessages(session: Session): AgentMessage[] {
  return session.messages.map((msg) => {
    const agentMsg: AgentMessage = {
      role: msg.role,
      content: msg.blocks
        .filter((b) => b.type === "text")
        .map((b) => b.text || "")
        .join("\n") || null,
    };
    // Reconstruct tool_calls from tool_use blocks
    const toolUseBlocks = msg.blocks.filter((b) => b.type === "tool_use");
    if (toolUseBlocks.length > 0) {
      agentMsg.tool_calls = toolUseBlocks.map((b, i) => ({
        id: `compacted_${i}_${Date.now()}`,
        type: "function" as const,
        function: {
          name: b.name || "unknown",
          arguments: b.input || "{}",
        },
      }));
    }
    // Reconstruct tool result fields
    const toolResultBlock = msg.blocks.find((b) => b.type === "tool_result");
    if (toolResultBlock) {
      agentMsg.name = toolResultBlock.toolName;
      agentMsg.content = toolResultBlock.output || "";
    }
    return agentMsg;
  });
}

/**
 * Estimate total tokens in the conversation (simple heuristic: ~4 chars per token).
 */
function estimateConversationTokens(messages: AgentMessage[]): number {
  let total = 0;
  for (const msg of messages) {
    total += Math.ceil((msg.content?.length || 0) / 4) + 4; // +4 for role/overhead
    if (msg.tool_calls) {
      for (const tc of msg.tool_calls) {
        total += Math.ceil((tc.function.name.length + tc.function.arguments.length) / 4) + 4;
      }
    }
  }
  return total;
}

interface AgentMessage {
  role: "system" | "user" | "assistant" | "tool";
  content: string | null;
  tool_calls?: Array<{
    id: string;
    type: "function";
    function: { name: string; arguments: string };
  }>;
  tool_call_id?: string;
  name?: string;
}

interface AgentConfig {
  model: string;
  apiProvider: string;
  apiKey?: string | null;
  apiBaseUrl?: string | null;
  maxTokens: number;
  temperature: number;
  topP: number;
  systemPrompt?: string | null;
  memory?: string | null;
  workDir?: string;
  effortLevel?: "low" | "medium" | "high";
  maxIterations?: number;
}

/**
 * TurnSummary — matches original conversation.rs TurnSummary struct.
 * Returned after each complete agent turn.
 */
export interface TurnSummary {
  assistantMessages: AgentMessage[];
  toolResults: AgentMessage[];
  iterations: number;
  usage: TokenUsage;
}

/**
 * Read git status (matches original read_git_status from prompt.rs)
 */
function readGitStatus(cwd: string): string | null {
  try {
    const output = execSync("git --no-optional-locks status --short --branch", {
      cwd,
      timeout: 5000,
      encoding: "utf-8",
      stdio: ["pipe", "pipe", "pipe"],
    }).trim();
    return output || null;
  } catch {
    return null;
  }
}

/**
 * Read git diff (matches original read_git_diff from prompt.rs)
 */
function readGitDiff(cwd: string): string | null {
  try {
    const sections: string[] = [];
    try {
      const staged = execSync("git diff --cached", {
        cwd,
        timeout: 5000,
        encoding: "utf-8",
        stdio: ["pipe", "pipe", "pipe"],
      }).trim();
      if (staged) sections.push(`Staged changes:\n${staged}`);
    } catch {}
    try {
      const unstaged = execSync("git diff", {
        cwd,
        timeout: 5000,
        encoding: "utf-8",
        stdio: ["pipe", "pipe", "pipe"],
      }).trim();
      if (unstaged) sections.push(`Unstaged changes:\n${unstaged}`);
    } catch {}
    return sections.length > 0 ? sections.join("\n\n") : null;
  } catch {
    return null;
  }
}

/**
 * Merge hook feedback into tool output — matches original merge_hook_feedback()
 */
function mergeHookFeedback(hookMessages: string[], output: string, denied: boolean): string {
  if (hookMessages.length === 0) return output;
  const sections: string[] = [];
  if (output.trim()) sections.push(output);
  const label = denied ? "Hook feedback (denied)" : "Hook feedback";
  sections.push(`${label}:\n${hookMessages.join("\n")}`);
  return sections.join("\n\n");
}
"Hook feedback (denied)" : "Hook feedback"; sections.push(`${label}:\n${hookMessages.join("\n")}`); return sections.join("\n\n"); } const DEFAULT_CONFIG: AgentConfig = { model: process.env.DEFAULT_MODEL || "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", apiProvider: "deepinfra", maxTokens: 32768, // Qwen3-Coder supports up to 65k output temperature: 0.5, // Lower temp = more focused/deterministic agent behavior topP: 0.95, // Slightly restricted for more coherent tool calls workDir: process.env.WORKSPACE_DIR || "/home/ubuntu", effortLevel: "high", }; /** * Retry config for transient API errors. * - 429 (rate limit): retry INFINITELY every 2 seconds until it works. * - 500/502/503 (server errors): retry INFINITELY every 2 seconds. * - Network errors: retry INFINITELY every 2 seconds. * We NEVER give up on transient errors — just keep trying. */ const RETRY_DELAY_MS = 2000; // fixed 2 second interval — simple and reliable /** * Resolve the API URL and key based on provider config */ function resolveApiConfig(config: AgentConfig) { // ─── HARDCODED FALLBACK — always works even if settings are corrupted ─── const FALLBACK_URL = "https://api.deepinfra.com/v1/openai"; const FALLBACK_MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo"; // Resolve model aliases (used for both default and custom paths) const aliasMap: Record = { // Xiaomi MiMo mimo: "XiaomiMiMo/MiMo-V2-Flash", "mimo-flash": "XiaomiMiMo/MiMo-V2-Flash", "mimo-v2": "XiaomiMiMo/MiMo-V2-Flash", // Qwen models (DeepInfra) "qwen-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", "qwen-coder-turbo": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", "qwen-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct", "qwen3-235b": "Qwen/Qwen3-235B-A22B-Instruct-2507", "qwen3-thinking": "Qwen/Qwen3-235B-A22B-Thinking-2507", "qwen3.5": "Qwen/Qwen3.5-397B-A17B", "qwen3-32b": "Qwen/Qwen3-32B", "qwen3-8b": "Qwen/Qwen3-8B", "qwen3-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", // Llama llama: "meta-llama/Llama-3.3-70B-Instruct", "llama-70b": "meta-llama/Llama-3.3-70B-Instruct", "llama-4": "meta-llama/Llama-4-Maverick-17B-128E", // DeepSeek deepseek: "deepseek-ai/DeepSeek-V3.2", "deepseek-r1": "deepseek-ai/DeepSeek-R1-0528", "deepseek-v3": "deepseek-ai/DeepSeek-V3.2", // NVIDIA nemotron: "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B", // StepFun step: "stepfun-ai/Step-3.5-Flash", "step-flash": "stepfun-ai/Step-3.5-Flash", // Uncensored hermes: "NousResearch/Hermes-3-Llama-3.1-70B", "hermes-405b": "NousResearch/Hermes-3-Llama-3.1-405B", uncensored: "NousResearch/Hermes-3-Llama-3.1-70B", // OpenAI GPT-5.x family "gpt5": "gpt-5.4", "gpt-5": "gpt-5.4", "gpt54": "gpt-5.4", // Anthropic aliases opus: "claude-opus-4-6", sonnet: "claude-sonnet-4-6", haiku: "claude-haiku-4-5-20251213", // xAI grok: "grok-3", "grok-3": "grok-3", // Google gemini: "google/gemini-2.5-flash", "gemini-pro": "google/gemini-2.5-pro", }; // Treat empty, null, masked, or built-in providers as "use server default" const hasCustomKey = config.apiKey && config.apiKey.length > 4 && !config.apiKey.startsWith("••••"); if (config.apiProvider === "claw" || config.apiProvider === "default" || config.apiProvider === "huggingface" || config.apiProvider === "deepinfra" || !hasCustomKey) { const defaultModel = process.env.DEFAULT_MODEL || FALLBACK_MODEL; const resolvedModel = aliasMap[config.model] || config.model || defaultModel; // Use BUILT_IN_FORGE_API_URL from env — HuggingFace router or OpenAI const baseUrl = (ENV.forgeApiUrl || FALLBACK_URL).replace(/\/$/, ""); const apiKey = ENV.forgeApiKey || 
process.env.BUILT_IN_FORGE_API_KEY || ""; console.log(`[agent] resolveApiConfig: using server default. URL=${baseUrl}, model=${resolvedModel}, hasKey=${!!apiKey}`); return { url: `${baseUrl}/chat/completions`, key: apiKey, model: resolvedModel || FALLBACK_MODEL, }; } // Custom provider path — user has their own API key let baseUrl = config.apiBaseUrl || ""; if (!baseUrl) { const providers: Record = { deepinfra: "https://api.deepinfra.com/v1/openai", huggingface: "https://router.huggingface.co/v1", xai: "https://api.x.ai/v1", openrouter: "https://openrouter.ai/api/v1", openai: "https://api.openai.com/v1", anthropic: "https://api.anthropic.com/v1", groq: "https://api.groq.com/openai/v1", cerebras: "https://api.cerebras.ai/v1", ollama: "http://localhost:11434/v1", }; baseUrl = providers[config.apiProvider] || FALLBACK_URL; } const resolvedModel = aliasMap[config.model] || config.model || FALLBACK_MODEL; console.log(`[agent] resolveApiConfig: custom provider. URL=${baseUrl}, model=${resolvedModel}`); return { url: `${baseUrl.replace(/\/$/, "")}/chat/completions`, key: config.apiKey, model: resolvedModel, }; } /** * Send an SSE event to the client */ function sendSSE(res: Response, event: string, data: unknown) { try { res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`); } catch { // Connection may be closed } } /** * Run the agentic loop: send messages to LLM, execute tool calls, repeat. * This is the core of the agent — it loops until the LLM stops calling tools. */ export async function runAgentLoop( messages: AgentMessage[], sessionId: number, config: Partial, res: Response, signal?: AbortSignal ): Promise<{ finalMessages: AgentMessage[]; totalPromptTokens: number; totalCompletionTokens: number; totalCost: number; model: string; }> { const cfg = { ...DEFAULT_CONFIG, ...config }; const apiConfig = resolveApiConfig(cfg); const workDir = cfg.workDir || "/home/ubuntu"; // Get plan mode state const planState = getPlanMode(sessionId); // Read git status and diff (matches original ProjectContext::discover_with_git) const gitStatus = readGitStatus(workDir); const gitDiff = readGitDiff(workDir); // Build system prompt with full environment context const systemPrompt = buildSystemPrompt({ memory: cfg.memory, effortLevel: cfg.effortLevel || "high", planMode: planState.active, planSteps: planState.steps, customSystemPrompt: cfg.systemPrompt, workDir, platform: "linux", model: apiConfig.model, gitStatus, gitDiff, }); // Initialize UsageTracker (matches original conversation.rs) const usageTracker = UsageTracker.new(); // Build conversation with system message first const conversationMessages: AgentMessage[] = [ { role: "system", content: systemPrompt }, ...messages.filter((m) => m.role !== "system"), ]; let totalPromptTokens = 0; let totalCompletionTokens = 0; let totalCost = 0; let iterations = 0; let emptyResponseRetries = 0; const MAX_EMPTY_RETRIES = 3; // Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that // causes runaway loops with Qwen3 which sometimes fails to stop generating. // 200 iterations is more than enough for any real task. const MAX_ITERATIONS = cfg.maxIterations || 200; const assistantMessages: AgentMessage[] = []; const toolResultMessages: AgentMessage[] = []; // ─── Loop detection: minimal safety net ───────────────────────────── // Only detect EXACT same tool+args repeated 5+ times (true infinite loop). // Everything else is handled by MAX_ITERATIONS. 
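// For reference, an illustrative sketch of what sendSSE writes on the wire (the values below are
// hypothetical, not produced by this module):
//
//   event: tool_call_start
//   data: {"id":"call_0","name":"read_file","arguments":"{\"path\":\"src/agent.ts\"}"}
//
// A client reading the streamed response body could split frames on the blank line ("\n\n") and
// route on the event name. Event names emitted in this file: status, message_start, text_delta,
// thinking_delta, tool_call_delta, tool_call_start, tool_result, permission_prompt,
// permission_denied, plan_update, assistant_message, auto_compact, context_usage, buddy_event,
// message_end, usage, error.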
/**
 * Run the agentic loop: send messages to LLM, execute tool calls, repeat.
 * This is the core of the agent — it loops until the LLM stops calling tools.
 */
export async function runAgentLoop(
  messages: AgentMessage[],
  sessionId: number,
  config: Partial<AgentConfig>,
  res: Response,
  signal?: AbortSignal
): Promise<{
  finalMessages: AgentMessage[];
  totalPromptTokens: number;
  totalCompletionTokens: number;
  totalCost: number;
  model: string;
}> {
  const cfg = { ...DEFAULT_CONFIG, ...config };
  const apiConfig = resolveApiConfig(cfg);
  const workDir = cfg.workDir || "/home/ubuntu";

  // Get plan mode state
  const planState = getPlanMode(sessionId);

  // Read git status and diff (matches original ProjectContext::discover_with_git)
  const gitStatus = readGitStatus(workDir);
  const gitDiff = readGitDiff(workDir);

  // Build system prompt with full environment context
  const systemPrompt = buildSystemPrompt({
    memory: cfg.memory,
    effortLevel: cfg.effortLevel || "high",
    planMode: planState.active,
    planSteps: planState.steps,
    customSystemPrompt: cfg.systemPrompt,
    workDir,
    platform: "linux",
    model: apiConfig.model,
    gitStatus,
    gitDiff,
  });

  // Initialize UsageTracker (matches original conversation.rs)
  const usageTracker = UsageTracker.new();

  // Build conversation with system message first
  const conversationMessages: AgentMessage[] = [
    { role: "system", content: systemPrompt },
    ...messages.filter((m) => m.role !== "system"),
  ];

  let totalPromptTokens = 0;
  let totalCompletionTokens = 0;
  let totalCost = 0;
  let iterations = 0;
  let emptyResponseRetries = 0;
  const MAX_EMPTY_RETRIES = 3;

  // Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that
  // causes runaway loops with Qwen3 which sometimes fails to stop generating.
  // 200 iterations is more than enough for any real task.
  const MAX_ITERATIONS = cfg.maxIterations || 200;

  const assistantMessages: AgentMessage[] = [];
  const toolResultMessages: AgentMessage[] = [];

  // ─── Loop detection: minimal safety net ─────────────────────────────
  // Only detect EXACT same tool+args repeated 5+ times (true infinite loop).
  // Everything else is handled by MAX_ITERATIONS.
  const recentToolSignatures: string[] = [];
  const MAX_EXACT_REPEATS = 5;

  // ─── MCP Tools Dynamic Injection (matches original claw-code) ──────────
  // Initialize MCP servers from config and merge discovered tools with static TOOL_DEFINITIONS.
  // This is how the original claw-code dynamically builds the tool list:
  // 1. Load MCP server configs from .claw/settings.json
  // 2. Connect to each server via stdio JSON-RPC
  // 3. Call tools/list to discover available tools
  // 4. Prefix tool names as mcp__servername__toolname
  // 5. Merge with static tool definitions
  let allTools = [...TOOL_DEFINITIONS];
  try {
    const mcpTools = await initializeMcpFromConfig(workDir);
    if (mcpTools.length > 0) {
      const mcpManager = getMcpManager();
      if (mcpManager) {
        const mcpDefs = mcpManager.getToolDefinitions();
        // Convert MCP tool format to OpenAI function calling format
        const mcpToolDefs = mcpDefs.map((t) => ({
          type: "function" as const,
          function: {
            name: t.name,
            description: t.description,
            parameters: t.input_schema || { type: "object", properties: {} },
          },
        }));
        allTools = [...TOOL_DEFINITIONS, ...mcpToolDefs];
        console.log(`[agent] MCP tools injected: ${mcpDefs.map((t) => t.name).join(", ")}`);
        sendSSE(res, "status", {
          status: "mcp_ready",
          message: `MCP tools loaded: ${mcpDefs.length} tools from ${mcpManager.getConnectedServers().length} servers`,
        });
      }
    }
  } catch (err: any) {
    console.error(`[agent] MCP initialization error (non-fatal):`, err.message);
    // MCP init failure is non-fatal — agent continues with static tools only
  }

  // ─── Context-aware compaction config ──────────────────────────────
  // Original claw-code uses percentage-based thresholds, not fixed 10k tokens.
  // We compute the threshold as 70% of the model's context window.
  const contextWindow = MODEL_CONTEXT_WINDOWS[apiConfig.model] || DEFAULT_CONTEXT_WINDOW;
  const dynamicCompactionConfig: CompactionConfig = {
    preserveRecentMessages: DEFAULT_COMPACTION_CONFIG.preserveRecentMessages,
    maxEstimatedTokens: Math.floor(contextWindow * 0.7),
  };

  sendSSE(res, "status", { status: "thinking", message: "Processing your request..." });

  while (iterations < MAX_ITERATIONS) {
    iterations++;

    if (signal?.aborted) {
      sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" });
      break;
    }

    // Build API request
    // Determine max_tokens limit based on provider
    const isDeepInfra = apiConfig.url.includes("deepinfra.com");
    const isHuggingFace = apiConfig.url.includes("huggingface.co");
    const maxTokensLimit = isHuggingFace ? 32000 : (isDeepInfra ? 65536 : 65536);

    // Detect if model supports thinking/reasoning mode (Qwen3 Thinking, DeepSeek-R1)
    const isThinkingModel = apiConfig.model.includes("Thinking") || apiConfig.model.includes("R1");

    const payload: Record<string, unknown> = {
      model: apiConfig.model,
      messages: conversationMessages.map((m) => {
        const msg: Record<string, unknown> = { role: m.role, content: m.content };
        if (m.tool_calls) msg.tool_calls = m.tool_calls;
        if (m.tool_call_id) msg.tool_call_id = m.tool_call_id;
        if (m.name) msg.name = m.name;
        return msg;
      }),
      tools: allTools,
      // Match original: always "auto" — let the model decide what tools to use
      tool_choice: "auto",
      max_tokens: Math.min(cfg.maxTokens, maxTokensLimit),
      temperature: isThinkingModel ?
0.6 : cfg.temperature, // thinking models need lower temp top_p: cfg.topP, stream: true, }; // Enable thinking/reasoning for supported models (Qwen3 Thinking, DeepSeek-R1) if (isThinkingModel && isDeepInfra) { (payload as any).extra_body = { enable_thinking: true }; } // Qwen3-Coder: hybrid thinking mode — model decides when to think // DO NOT disable thinking; Qwen3 automatically uses blocks for complex tasks // The /no_think in system prompt is only a soft hint for simple responses, // the model will override it when it needs to reason through tool calls or code // NOTE: Prefill technique (empty assistant message) was removed because it caused // the model to "talk to itself" — seeing an empty assistant message made it think // it already responded and continue generating follow-up messages in a loop. // Anti-censorship is handled purely via system prompt directives instead. sendSSE(res, "message_start", { iteration: iterations }); try { // ─── Retry loop with INFINITE retries for 429, limited for 5xx ───── // 429 = rate limit: the API is telling us to slow down, NOT to give up. // We MUST keep retrying with increasing backoff until it works. // 500/502/503 = server errors: retry up to MAX_SERVER_RETRIES times. let response: globalThis.Response | null = null; let lastErrorText = ""; let lastStatus = 0; let serverRetryCount = 0; let rateLimitRetryCount = 0; while (true) { if (signal?.aborted) break; try { response = await fetch(apiConfig.url, { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}`, }, body: JSON.stringify(payload), signal, }); } catch (fetchErr: any) { // Network error (DNS, connection refused, etc.) — retry every 2s if (signal?.aborted) break; serverRetryCount++; console.error(`[agent] Fetch error (retry #${serverRetryCount}):`, fetchErr.message); sendSSE(res, "status", { status: "retrying", message: `Network error, retrying in 2s... (attempt #${serverRetryCount})`, }); await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); continue; } if (response.ok) break; lastStatus = response.status; lastErrorText = await response.text(); // ─── 429 Rate Limit: INFINITE retry every 2s ─── if (response.status === 429) { rateLimitRetryCount++; console.log(`[agent] Rate limited (429) — retry #${rateLimitRetryCount} in 2s`); sendSSE(res, "status", { status: "rate_limited", message: `Rate limited by API — retrying in 2s... (attempt #${rateLimitRetryCount})`, }); await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); response = null; continue; // NEVER give up on 429 } // ─── 500/502/503 Server errors: INFINITE retry every 2s ─── if ([500, 502, 503].includes(response.status)) { serverRetryCount++; console.log(`[agent] Server error ${response.status} — retry #${serverRetryCount} in 2s`); sendSSE(res, "status", { status: "retrying", message: `Server error ${response.status}, retrying in 2s... (attempt #${serverRetryCount})`, }); await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); response = null; continue; } // Any other error (400, 401, 403, 404, etc.) 
— don't retry break; } if (!response || !response.ok) { console.error(`[agent] API error ${lastStatus}:`, lastErrorText); console.error(`[agent] Payload model:`, apiConfig.model); console.error(`[agent] Payload messages count:`, (payload.messages as any[]).length); // ─── AUTO-COMPACT on context overflow (400 error) ───────────── if (lastStatus === 400 && (lastErrorText.includes("context_length") || lastErrorText.includes("too many tokens") || lastErrorText.includes("maximum context") || lastErrorText.includes("token limit") || lastErrorText.includes("too long"))) { console.log(`[agent] Context overflow detected — auto-compacting conversation...`); sendSSE(res, "status", { status: "compacting", message: "Context window exceeded — auto-compacting conversation...", }); try { const session = agentMessagesToSession(conversationMessages); // LLM-based summarization: use the same API to produce a real summary const llmFetch = async (msgs: Array<{role: string; content: string}>) => { const summaryResp = await fetch(apiConfig.url, { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, body: JSON.stringify({ model: apiConfig.model, messages: msgs, max_tokens: 2000, temperature: 0.3, stream: false, }), }); if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); const json = await summaryResp.json(); return json.choices?.[0]?.message?.content || ""; }; const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetch); if (compactResult.removedMessageCount > 0) { const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); conversationMessages.length = 0; conversationMessages.push({ role: "system", content: systemPrompt }); conversationMessages.push(...compactedAgentMessages); console.log(`[agent] Auto-compact (LLM): removed ${compactResult.removedMessageCount} messages, kept ${conversationMessages.length}`); sendSSE(res, "auto_compact", { removedCount: compactResult.removedMessageCount, keptCount: conversationMessages.length, summary: compactResult.formattedSummary, }); continue; // retry with compacted context } else { console.error(`[agent] Auto-compact produced no reduction — breaking`); sendSSE(res, "error", { message: `Context overflow but compaction couldn't reduce further`, details: lastErrorText, }); break; } } catch (compactErr: any) { console.error(`[agent] Auto-compact failed:`, compactErr.message); sendSSE(res, "error", { message: `Context overflow — auto-compact failed: ${compactErr.message}`, details: lastErrorText, }); break; } } // Non-context 400 errors — log details for debugging if (lastStatus === 400) { console.error(`[agent] Full error body:`, lastErrorText); (payload.messages as any[]).forEach((m: any, i: number) => { console.error(`[agent] msg[${i}] role=${m.role} content_type=${typeof m.content} content_len=${String(m.content || '').length} has_tool_calls=${!!m.tool_calls} has_tool_call_id=${!!m.tool_call_id}`); }); } sendSSE(res, "error", { message: `API error: ${lastStatus}${lastStatus === 429 ? 
' (rate limit)' : ''} — ${lastErrorText.substring(0, 200)}`, details: lastErrorText, }); break; } // Process streaming response let result: { content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string } }>; usage?: any }; try { result = await processStream(response, res, signal); } catch (streamErr: any) { // Stream processing error — treat as transient, retry console.error(`[agent] Stream processing error:`, streamErr.message); if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { sendSSE(res, "status", { status: "retrying", message: `Stream error, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); await new Promise(r => setTimeout(r, 1500)); continue; } sendSSE(res, "error", { message: `Stream failed after ${MAX_EMPTY_RETRIES} retries: ${streamErr.message}` }); break; } // ─── Bug #1 fix: Handle empty LLM response with retry ───────── // Original claw-code retries on empty response instead of crashing. // Open-source models via HuggingFace often return empty streams. if (!result.content && result.toolCalls.length === 0) { if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { console.warn(`[agent] Empty response from LLM — retry ${emptyResponseRetries}/${MAX_EMPTY_RETRIES}`); sendSSE(res, "status", { status: "retrying", message: `Empty response from model, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); await new Promise(r => setTimeout(r, 1500)); continue; // retry same iteration } console.error(`[agent] LLM returned empty response ${MAX_EMPTY_RETRIES} times — giving up`); sendSSE(res, "error", { message: `Model returned empty response after ${MAX_EMPTY_RETRIES} retries. Try a different model or reduce context.` }); break; } emptyResponseRetries = 0; // reset on successful response // Track usage with UsageTracker (matches original) if (result.usage) { totalPromptTokens += result.usage.prompt_tokens || 0; totalCompletionTokens += result.usage.completion_tokens || 0; usageTracker.record({ input_tokens: result.usage.prompt_tokens || 0, output_tokens: result.usage.completion_tokens || 0, cache_creation_input_tokens: result.usage.cache_creation_input_tokens || 0, cache_read_input_tokens: result.usage.cache_read_input_tokens || 0, }); } // Add assistant message to conversation const assistantMessage: AgentMessage = { role: "assistant", // Match original: null when no content (even with tool_calls) content: result.content || null, }; if (result.toolCalls && result.toolCalls.length > 0) { assistantMessage.tool_calls = result.toolCalls; } conversationMessages.push(assistantMessage); assistantMessages.push(assistantMessage); // If no tool calls, we're done — the LLM has finished responding. // This matches the original claw-code behavior exactly: // the model decides when to stop by not calling tools. if (!result.toolCalls || result.toolCalls.length === 0) { sendSSE(res, "message_end", { promptTokens: totalPromptTokens, completionTokens: totalCompletionTokens, cost: totalCost, model: apiConfig.model, }); break; } // ─── Minimal loop detection: only catch TRUE infinite loops ─────── // Only break if the EXACT same tool+args is repeated 5+ times. // This is the only safety net beyond MAX_ITERATIONS. // The original claw-code has NO loop detection at all — it trusts the model. 
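      // Illustrative example (hypothetical values): with MAX_EXACT_REPEATS = 5, a turn that calls
      // read_file with {"path":"a.ts"} five times in a row yields five identical signatures like
      //   "read_file:{\"path\":\"a.ts\"}"
      // in the sliding window below, which trips the check; if any argument differs between calls,
      // the "all same" test fails and the loop continues, so ordinary repeated use of one tool is
      // not flagged.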
      const currentToolSig = result.toolCalls.map((tc: any) => `${tc.function.name}:${tc.function.arguments}`).join("|");
      recentToolSignatures.push(currentToolSig);
      if (recentToolSignatures.length > MAX_EXACT_REPEATS) {
        recentToolSignatures.shift();
      }
      if (recentToolSignatures.length >= MAX_EXACT_REPEATS) {
        const allSame = recentToolSignatures.every(r => r === recentToolSignatures[0]);
        if (allSame) {
          console.warn(`[agent] Infinite loop detected: exact same tool call repeated ${MAX_EXACT_REPEATS} times — breaking`);
          sendSSE(res, "error", {
            message: `⚠️ Infinite loop detected. Try rephrasing your request`,
          });
          sendSSE(res, "message_end", {
            promptTokens: totalPromptTokens,
            completionTokens: totalCompletionTokens,
            cost: totalCost,
            model: apiConfig.model,
          });
          break;
        }
      }

      // ─── Execute tool calls ──────────────────────────────────────────
      // Bug #2+#3 fix: Each tool call is wrapped in its own try-catch.
      // Original claw-code sends tool errors back to LLM as tool results,
      // letting the model decide how to handle them. We NEVER break the
      // loop on a tool error — only on fatal API/stream errors.
      for (const toolCall of result.toolCalls) {
        const toolName = toolCall.function.name;
        let toolArgs: Record<string, any> = {};
        let argParseError = false;
        try {
          toolArgs = JSON.parse(toolCall.function.arguments || "{}");
        } catch (parseErr: any) {
          // Try JSON repair before giving up
          try {
            const { jsonrepair } = await import("jsonrepair");
            const repaired = jsonrepair(toolCall.function.arguments || "{}");
            toolArgs = JSON.parse(repaired);
            console.info(`[agent] Repaired malformed JSON for ${toolName}`);
          } catch (repairErr: any) {
            argParseError = true;
            console.warn(`[agent] Malformed tool args for ${toolName} (repair failed):`, toolCall.function.arguments?.substring(0, 200));
          }
        }

        sendSSE(res, "tool_call_start", {
          id: toolCall.id,
          name: toolName,
          arguments: toolCall.function.arguments,
        });

        let toolOutput: string;
        let isError = false;

        // If JSON args were malformed, skip execution and tell LLM to fix
        if (argParseError) {
          toolOutput = `Error: Your tool call arguments for '${toolName}' contained malformed JSON. The raw arguments were: ${(toolCall.function.arguments || "").substring(0, 500)}.
Please fix the JSON and try again.`; isError = true; } else try { // ─── Pre-tool hooks (matches original HookRunner.run_pre_tool_use) ── const preHookResult = await runPreToolHooks(toolName, sessionId, toolArgs, workDir); if (!preHookResult.allowed) { // Hook denied the tool execution (exit code 2 = deny) toolOutput = preHookResult.message || `Tool '${toolName}' was denied by pre-tool hook`; isError = true; sendSSE(res, "permission_denied", { toolName, toolCallId: toolCall.id, reason: toolOutput, needsPrompt: false, }); } else { // Execute the tool with the correct working directory const toolResult = await executeTool(toolName, toolArgs, sessionId, workDir); if (toolResult.isError && toolResult.output.includes("needs one-time approval")) { sendSSE(res, "permission_prompt", { toolName, toolCallId: toolCall.id, reason: toolResult.output, }); } toolOutput = toolResult.output; isError = toolResult.isError || false; // Merge pre-hook feedback (matches original merge_hook_feedback) if (preHookResult.message) { toolOutput = mergeHookFeedback([preHookResult.message], toolOutput, false); } // ─── Post-tool hooks (matches original HookRunner.run_post_tool_use) ── const postHookResult = await runPostToolHooks(toolName, sessionId, toolResult, workDir); toolOutput = postHookResult.output; isError = postHookResult.isError || false; } } catch (toolExecError: any) { // ─── Bug #3 fix: Tool exception → error result for LLM ────── // Original claw-code: tool errors become tool results, NOT loop breaks. // The LLM sees the error and can try a different approach. console.error(`[agent] Tool '${toolName}' threw exception:`, toolExecError.message); toolOutput = `Tool execution error: ${toolExecError.message}`; isError = true; } // No error classification or guidance injection. // The model receives raw error output and decides how to handle it. // This matches the original claw-code behavior. sendSSE(res, "tool_result", { toolCallId: toolCall.id, toolName, output: toolOutput, isError, durationMs: 0, }); // ─── Special SSE events for interactive tools ──────────────── // SendUserMessage / Brief: emit SSE for frontend display but DO NOT break the loop. // Original claw-code does NOT stop on SendUserMessage — the model can send // progress updates ("checking...", "found vulnerability...") AND continue working. // Breaking here was the #1 cause of the agent stopping mid-task. if ((toolName === "SendUserMessage" || toolName === "Brief" || toolName === "ask_user") && !isError) { sendSSE(res, "assistant_message", { message: toolArgs.message || toolArgs.question || "", attachments: toolArgs.attachments || [], }); } // Plan/Todo tools: emit plan state updates if (["TodoWrite", "plan_create", "plan_update", "enter_plan_mode", "exit_plan_mode"].includes(toolName)) { const updatedPlan = getPlanMode(sessionId); sendSSE(res, "plan_update", { active: updatedPlan.active, steps: updatedPlan.steps, }); } // Add tool result to conversation for the LLM to process const toolResultMsg: AgentMessage = { role: "tool", content: toolOutput, tool_call_id: toolCall.id, name: toolName, }; conversationMessages.push(toolResultMsg); toolResultMessages.push(toolResultMsg); } // No consecutive error detection — the model handles errors naturally. // MAX_ITERATIONS (200) is the ultimate safety net. // SendUserMessage does NOT break the loop (matches original). 
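      // Illustrative shape of the tool-result message appended above (hypothetical values);
      // tool_call_id echoes the id from the assistant's tool_calls entry so the model can pair
      // each result with the call that produced it:
      //
      //   { role: "tool", content: "File written: src/agent.ts (142 lines)",
      //     tool_call_id: "call_abc123", name: "write_file" }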
// ─── Proactive auto-compact check ───────────────────────────── // Check if conversation is approaching context window limit and compact proactively const estimatedTokens = estimateConversationTokens(conversationMessages); // contextWindow already computed above (line 397) const contextUsagePercent = Math.round((estimatedTokens / contextWindow) * 100); // Emit context usage SSE for frontend tracking sendSSE(res, "context_usage", { estimatedTokens, contextWindow, usagePercent: contextUsagePercent, messageCount: conversationMessages.length, }); // Proactive compaction at 80% context usage if (contextUsagePercent >= 80) { console.log(`[agent] Context at ${contextUsagePercent}% — proactive auto-compact`); sendSSE(res, "status", { status: "compacting", message: `Context at ${contextUsagePercent}% — auto-compacting to free space...`, }); try { const session = agentMessagesToSession(conversationMessages); // LLM-based summarization for proactive compaction const llmFetchProactive = async (msgs: Array<{role: string; content: string}>) => { const summaryResp = await fetch(apiConfig.url, { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, body: JSON.stringify({ model: apiConfig.model, messages: msgs, max_tokens: 2000, temperature: 0.3, stream: false, }), }); if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); const json = await summaryResp.json(); return json.choices?.[0]?.message?.content || ""; }; const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetchProactive); if (compactResult.removedMessageCount > 0) { const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); conversationMessages.length = 0; // CRITICAL: Re-prepend original system prompt before compacted summary. conversationMessages.push({ role: "system", content: systemPrompt }); conversationMessages.push(...compactedAgentMessages); // Inject current todo/plan state so the agent doesn't lose its plan after compaction const todoState = (() => { try { const executor = require("../tools/executor"); const plan = executor.getPlanMode(sessionId); const todos = executor.todoLists?.get?.(sessionId) || []; let state = ""; if (todos.length > 0) { state += "\n\n[PRESERVED TODO LIST]\n" + todos.map((t: any, i: number) => { const icon = t.status === "completed" ? "\u2713" : t.status === "in_progress" ? "\u25cf" : "\u25cb"; return ` ${icon} ${i + 1}. ${t.content} [${t.status}]`; }).join("\n"); } if (plan?.active && plan.steps?.length > 0) { state += "\n\n[PRESERVED PLAN]\n" + plan.steps.map((s: any) => { const icon = s.status === "done" ? "\u2713" : s.status === "in_progress" ? "\u25cf" : "\u25a1"; return ` ${icon} ${s.id}. 
${s.text} [${s.status}]`; }).join("\n"); } return state; } catch { return ""; } })(); if (todoState) { // Append todo state to the last user/system message so the model sees it const lastMsg = conversationMessages[conversationMessages.length - 1]; if (lastMsg && typeof lastMsg.content === "string") { lastMsg.content += todoState; } } sendSSE(res, "auto_compact", { removedCount: compactResult.removedMessageCount, keptCount: conversationMessages.length, summary: compactResult.formattedSummary, }); console.log(`[agent] Proactive compact: removed ${compactResult.removedMessageCount} messages`); } } catch (compactErr: any) { console.error(`[agent] Proactive compact failed (non-fatal):`, compactErr.message); } } // ─── Buddy events SSE ──────────────────────────────────────────── // Emit buddy_event for each tool call so frontend can award XP for (const toolCall of result.toolCalls) { const tn = toolCall.function.name; sendSSE(res, "buddy_event", { type: "tool_call", toolName: tn, iteration: iterations, }); // Special buddy events for file creation if (tn === "write_file" || tn === "create_file") { sendSSE(res, "buddy_event", { type: "file_created", toolName: tn, iteration: iterations, }); } } // Continue the loop — LLM will see tool results and decide next action sendSSE(res, "status", { status: "thinking", message: `Processing tool results (iteration ${iterations}, context: ${contextUsagePercent}%)...`, }); } catch (error: any) { // ─── Bug #2 fix: Distinguish fatal vs transient errors ──────── // Only AbortError and unrecoverable errors should break the loop. // Stream/fetch errors are already handled above with retry logic. if (error.name === "AbortError" || signal?.aborted) { sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" }); break; } // For other errors, log and break (these are truly unexpected) console.error(`[agent] Unexpected error in agent loop:`, error.message, error.stack); sendSSE(res, "error", { message: error.message || "Unknown error" }); break; } } if (iterations >= MAX_ITERATIONS) { sendSSE(res, "error", { message: `Maximum iterations (${MAX_ITERATIONS}) reached. Use /compact to reduce context and continue.` }); } // ─── Buddy: session_completed event ───────────────────────────────── // Emit session_completed so Buddy can award XP for finishing a turn sendSSE(res, "buddy_event", { type: "session_completed", iterations, toolCallCount: toolResultMessages.length, }); // Calculate cost using UsageTracker (matches original) const cumulativeUsage = usageTracker.cumulativeUsage(); const modelPricing = pricingForModel(apiConfig.model) ?? 
defaultSonnetTierPricing(); const costEstimate = estimateCostUsdWithPricing(cumulativeUsage, modelPricing); totalCost = totalCostUsd(costEstimate); // Emit usage summary lines (matches original summary_lines_for_model) const usageSummary = summaryLinesForModel(cumulativeUsage, "session", apiConfig.model); sendSSE(res, "usage", { promptTokens: totalPromptTokens, completionTokens: totalCompletionTokens, totalTokens: totalPromptTokens + totalCompletionTokens, cost: totalCost, cacheCreationTokens: cumulativeUsage.cache_creation_input_tokens, cacheReadTokens: cumulativeUsage.cache_read_input_tokens, usageSummary, turns: usageTracker.turns(), formattedCost: formatUsd(totalCost), }); return { finalMessages: conversationMessages.filter((m) => m.role !== "system"), totalPromptTokens, totalCompletionTokens, totalCost, model: apiConfig.model, }; } /** * Process a streaming response from the LLM API (OpenAI-compatible SSE format) */ async function processStream( response: globalThis.Response, res: Response, signal?: AbortSignal ): Promise<{ content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string }; }>; usage?: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number }; }> { const reader = response.body?.getReader(); if (!reader) throw new Error("No response body"); const decoder = new TextDecoder(); let content = ""; const toolCalls: Map< number, { id: string; type: "function"; function: { name: string; arguments: string } } > = new Map(); let usage: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number } | undefined; let buffer = ""; try { while (true) { if (signal?.aborted) break; const { done, value } = await reader.read(); if (done) break; buffer += decoder.decode(value, { stream: true }); // Process complete SSE lines const lines = buffer.split("\n"); buffer = lines.pop() || ""; for (const line of lines) { if (!line.startsWith("data: ")) continue; const data = line.slice(6).trim(); if (data === "[DONE]") continue; try { const chunk = JSON.parse(data); const delta = chunk.choices?.[0]?.delta; // Detect API errors returned inside the SSE stream (e.g. DeepInfra "Operation not allowed") if (chunk.error) { const errMsg = chunk.error.message || chunk.error.type || JSON.stringify(chunk.error); console.error(`[agent] API error in stream: ${errMsg}`); throw new Error(`API error in stream: ${errMsg}`); } if (!delta) { if (chunk.usage) { usage = { prompt_tokens: chunk.usage.prompt_tokens || 0, completion_tokens: chunk.usage.completion_tokens || 0, cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, }; } continue; } // Reasoning/thinking content (Qwen3 Thinking, DeepSeek-R1) // These models return reasoning in delta.reasoning_content before the actual response if (delta.reasoning_content) { sendSSE(res, "thinking_delta", { text: delta.reasoning_content }); } // Text content streaming if (delta.content) { content += delta.content; sendSSE(res, "text_delta", { text: delta.content }); } // Tool call streaming if (delta.tool_calls) { for (const tc of delta.tool_calls) { const idx = tc.index ?? 
0; if (!toolCalls.has(idx)) { toolCalls.set(idx, { id: tc.id || `call_${idx}_${Date.now()}`, type: "function", function: { name: tc.function?.name || "", arguments: "" }, }); } const existing = toolCalls.get(idx)!; if (tc.id) existing.id = tc.id; if (tc.function?.name) existing.function.name = tc.function.name; if (tc.function?.arguments) { existing.function.arguments += tc.function.arguments; sendSSE(res, "tool_call_delta", { id: existing.id, name: existing.function.name, arguments: tc.function.arguments, }); } } } // Usage info if (chunk.usage) { usage = { prompt_tokens: chunk.usage.prompt_tokens || 0, completion_tokens: chunk.usage.completion_tokens || 0, cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, }; } } catch (parseErr: any) { // Re-throw API errors (these are NOT malformed JSON — they're real errors) if (parseErr?.message?.startsWith('API error in stream:')) { throw parseErr; } // Skip genuinely malformed JSON chunks (partial SSE data, etc.) } } } } finally { reader.releaseLock(); } // Process remaining buffer (last line without trailing \n) if (buffer.trim() && buffer.startsWith("data: ")) { const data = buffer.slice(6).trim(); if (data !== "[DONE]") { try { const chunk = JSON.parse(data); const delta = chunk.choices?.[0]?.delta; if (delta?.content) { content += delta.content; sendSSE(res, "text_delta", { text: delta.content }); } if (delta?.tool_calls) { for (const tc of delta.tool_calls) { const idx = tc.index ?? 0; if (!toolCalls.has(idx)) { toolCalls.set(idx, { id: tc.id || `call_${idx}_${Date.now()}`, type: "function", function: { name: tc.function?.name || "", arguments: "" }, }); } const existing = toolCalls.get(idx)!; if (tc.id) existing.id = tc.id; if (tc.function?.name) existing.function.name = tc.function.name; if (tc.function?.arguments) existing.function.arguments += tc.function.arguments; } } if (chunk.usage) { usage = { prompt_tokens: chunk.usage.prompt_tokens || 0, completion_tokens: chunk.usage.completion_tokens || 0, cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, }; } // Check finish_reason for truncation const finishReason = chunk.choices?.[0]?.finish_reason; if (finishReason === "length") { console.warn("[agent] Response truncated (finish_reason=length) — tool call args may be incomplete"); } } catch { /* skip malformed */ } } } // Original claw-code retries on empty response instead of throwing. 
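  // Illustrative chunk shapes this parser expects from an OpenAI-compatible stream (hypothetical
  // values): a text delta, a tool-call fragment accumulated by index, and a usage-only chunk
  // whose missing delta routes it to the usage branch above:
  //
  //   data: {"choices":[{"delta":{"content":"Hel"}}]}
  //   data: {"choices":[{"delta":{"tool_calls":[{"index":0,"id":"call_1","function":{"name":"Bash","arguments":"{\"cmd"}}]}}]}
  //   data: {"choices":[{"delta":null,"finish_reason":"stop"}],"usage":{"prompt_tokens":1200,"completion_tokens":85}}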
  if (content.length === 0 && toolCalls.size === 0) {
    console.warn("[agent] LLM returned empty response — will be retried by agent loop");
  }

  return {
    content,
    toolCalls: Array.from(toolCalls.values()),
    usage,
  };
}

/**
 * Estimate cost based on model and token counts
 */
function estimateCost(model: string, promptTokens: number, completionTokens: number): number {
  // Pricing per 1M tokens — aligned with original claw-code model registry
  const pricing: Record<string, { input: number; output: number }> = {
    // Claw API / Anthropic
    "claude-opus-4-6": { input: 15.00, output: 75.00 },
    "claude-sonnet-4-6": { input: 3.00, output: 15.00 },
    "claude-haiku-4-5-20251213": { input: 0.80, output: 4.00 },
    // xAI Grok
    "grok-3": { input: 3.00, output: 15.00 },
    "grok-3-mini": { input: 0.30, output: 0.50 },
    "grok-2": { input: 2.00, output: 10.00 },
    // OpenAI
    "gpt-5.4": { input: 2.50, output: 15.00 },
    "gpt-5.4-mini": { input: 0.40, output: 1.60 },
    "gpt-5.3-codex": { input: 2.50, output: 10.00 },
    "gpt-4.1": { input: 2.00, output: 8.00 },
    "gpt-4.1-mini": { input: 0.40, output: 1.60 },
    "o3": { input: 10.00, output: 40.00 },
    "o4-mini": { input: 1.10, output: 4.40 },
    // HuggingFace Inference API (free tier = $0, Pro tier = included in subscription)
    "XiaomiMiMo/MiMo-V2-Flash": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-Coder-Next": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-8B": { input: 0.00, output: 0.00 },
    "Qwen/Qwen3-Coder-30B-A3B-Instruct": { input: 0.00, output: 0.00 },
    "meta-llama/Llama-3.3-70B-Instruct": { input: 0.00, output: 0.00 },
    "deepseek-ai/DeepSeek-V3.2": { input: 0.00, output: 0.00 },
    "deepseek-ai/DeepSeek-R1": { input: 0.00, output: 0.00 },
    // OpenRouter variants
    "anthropic/claude-opus-4-6": { input: 15.00, output: 75.00 },
    "anthropic/claude-sonnet-4-6": { input: 3.00, output: 15.00 },
    "google/gemini-2.5-pro": { input: 1.25, output: 10.00 },
    "google/gemini-2.5-flash": { input: 0.15, output: 0.60 },
  };
  const rates = pricing[model] || { input: 1.00, output: 3.00 };
  return (promptTokens * rates.input + completionTokens * rates.output) / 1_000_000;
}
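// Worked example (illustrative) of the arithmetic above: for "claude-sonnet-4-6" at $3.00 input /
// $15.00 output per 1M tokens, a turn with 12,000 prompt tokens and 3,500 completion tokens costs
// (12000 * 3.00 + 3500 * 15.00) / 1_000_000 = 88500 / 1_000_000 ≈ $0.0885.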