// claw-web-v2/server/runtime/agent.ts
/**
* Claw Agent Runtime — the core agentic conversation loop.
* Handles streaming responses, tool calls, and multi-turn conversations.
*/
import { ENV } from "../_core/env";
import { buildSystemPrompt, TOOL_DEFINITIONS } from "./system-prompt";
import { executeTool, getPlanMode, runPreToolHooks, runPostToolHooks, initializeMcpFromConfig, getMcpManager } from "../tools/executor";
import { compactSessionWithLLM, dbMessagesToSession, DEFAULT_COMPACTION_CONFIG } from "./compact";
import type { Session, CompactionConfig } from "./compact";
import { UsageTracker, pricingForModel, defaultSonnetTierPricing, estimateCostUsdWithPricing, totalCostUsd, formatUsd, summaryLinesForModel } from "./usage";
import type { TokenUsage } from "./usage";
import type { Response } from "express";
import { execSync } from "child_process";
// In original claw-code, max_iterations defaults to usize::MAX (effectively unlimited).
// Auto-compact is triggered on context overflow (400 error) — matches original compact() method.
// Context window sizes for known models (used for proactive compaction)
const MODEL_CONTEXT_WINDOWS: Record<string, number> = {
// Xiaomi MiMo
"XiaomiMiMo/MiMo-V2-Flash": 262144,
// Qwen models (DeepInfra + HuggingFace)
"Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo": 262144,
"Qwen/Qwen3-Coder-480B-A35B-Instruct": 262144,
"Qwen/Qwen3-235B-A22B-Instruct-2507": 262144,
"Qwen/Qwen3-235B-A22B-Thinking-2507": 262144,
"Qwen/Qwen3.5-397B-A17B": 262144,
"Qwen/Qwen3.5-122B-A10B": 262144,
"Qwen/Qwen3-Coder-Next": 131072,
"Qwen/Qwen3-32B": 40960,
"Qwen/Qwen3-8B": 32768,
"Qwen/Qwen3-Coder-30B-A3B-Instruct": 131072,
// Meta Llama
"meta-llama/Llama-3.3-70B-Instruct": 131072,
"meta-llama/Llama-4-Maverick-17B-128E": 1048576,
"meta-llama/Llama-4-Scout-17B-16E": 327680,
// DeepSeek
"deepseek-ai/DeepSeek-V3.2": 163840,
"deepseek-ai/DeepSeek-V3.1": 163840,
"deepseek-ai/DeepSeek-R1": 131072,
"deepseek-ai/DeepSeek-R1-0528": 163840,
// NVIDIA Nemotron
"nvidia/NVIDIA-Nemotron-3-Super-120B-A12B": 262144,
// StepFun
"stepfun-ai/Step-3.5-Flash": 262144,
// NousResearch (uncensored)
"NousResearch/Hermes-3-Llama-3.1-70B": 131072,
"NousResearch/Hermes-3-Llama-3.1-405B": 131072,
// Anthropic
"claude-opus-4-6": 200000,
"claude-sonnet-4-6": 200000,
// OpenAI
"gpt-5.4": 1048576,
"gpt-4.1": 1048576,
// xAI
"grok-3": 131072,
// Google
"google/gemini-2.5-flash": 1000000,
"google/gemini-2.5-pro": 1000000,
};
const DEFAULT_CONTEXT_WINDOW = 131072;
/**
* Convert agent messages to compact.ts Session format for compaction.
*/
function agentMessagesToSession(messages: AgentMessage[]): Session {
return dbMessagesToSession(
messages.map((m) => ({
role: m.role,
content: m.content || "",
toolName: m.name || null,
toolCallId: m.tool_call_id || null,
}))
);
}
/**
* Convert compacted Session back to AgentMessage[] format.
*/
function sessionToAgentMessages(session: Session): AgentMessage[] {
return session.messages.map((msg) => {
const agentMsg: AgentMessage = {
role: msg.role,
content: msg.blocks
.filter((b) => b.type === "text")
.map((b) => b.text || "")
.join("\n") || null,
};
// Reconstruct tool_calls from tool_use blocks
const toolUseBlocks = msg.blocks.filter((b) => b.type === "tool_use");
if (toolUseBlocks.length > 0) {
agentMsg.tool_calls = toolUseBlocks.map((b, i) => ({
id: `compacted_${i}_${Date.now()}`,
type: "function" as const,
function: {
name: b.name || "unknown",
arguments: b.input || "{}",
},
}));
}
// Reconstruct tool result fields
const toolResultBlock = msg.blocks.find((b) => b.type === "tool_result");
if (toolResultBlock) {
agentMsg.name = toolResultBlock.toolName;
agentMsg.content = toolResultBlock.output || "";
}
return agentMsg;
});
}
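// Round-trip sketch: compaction operates on the Session form, then we restore
// to AgentMessage[]. Tool-call IDs are regenerated ("compacted_<i>_<timestamp>"),
// so the round trip preserves structure but not the original call IDs:
//   const session = agentMessagesToSession(conversationMessages);
//   const restored = sessionToAgentMessages(session);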
/**
* Estimate total tokens in the conversation (simple heuristic: ~4 chars per token).
*/
function estimateConversationTokens(messages: AgentMessage[]): number {
let total = 0;
for (const msg of messages) {
total += Math.ceil((msg.content?.length || 0) / 4) + 4; // +4 for role/overhead
if (msg.tool_calls) {
for (const tc of msg.tool_calls) {
total += Math.ceil((tc.function.name.length + tc.function.arguments.length) / 4) + 4;
}
}
}
return total;
}
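// Worked example of the heuristic: a 400-char message carrying one tool call
// whose name + arguments total 60 chars estimates to
//   ceil(400/4) + 4 + ceil(60/4) + 4 = 100 + 4 + 15 + 4 = 123 tokens.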
interface AgentMessage {
role: "system" | "user" | "assistant" | "tool";
content: string | null;
tool_calls?: Array<{
id: string;
type: "function";
function: { name: string; arguments: string };
}>;
tool_call_id?: string;
name?: string;
}
interface AgentConfig {
model: string;
apiProvider: string;
apiKey?: string | null;
apiBaseUrl?: string | null;
maxTokens: number;
temperature: number;
topP: number;
systemPrompt?: string | null;
memory?: string | null;
workDir?: string;
effortLevel?: "low" | "medium" | "high";
maxIterations?: number;
}
/**
* TurnSummary — matches original conversation.rs TurnSummary struct.
* Returned after each complete agent turn.
*/
export interface TurnSummary {
assistantMessages: AgentMessage[];
toolResults: AgentMessage[];
iterations: number;
usage: TokenUsage;
}
/**
* Read git status (matches original read_git_status from prompt.rs)
*/
function readGitStatus(cwd: string): string | null {
try {
const output = execSync("git --no-optional-locks status --short --branch", {
cwd,
timeout: 5000,
encoding: "utf-8",
stdio: ["pipe", "pipe", "pipe"],
}).trim();
return output || null;
} catch {
return null;
}
}
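// Shape of the output this returns (branch and files are hypothetical):
//   ## main...origin/main
//    M server/runtime/agent.ts
//   ?? notes.txt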
/**
* Read git diff (matches original read_git_diff from prompt.rs)
*/
function readGitDiff(cwd: string): string | null {
try {
const sections: string[] = [];
try {
const staged = execSync("git diff --cached", {
cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"],
}).trim();
if (staged) sections.push(`Staged changes:\n${staged}`);
} catch {}
try {
const unstaged = execSync("git diff", {
cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"],
}).trim();
if (unstaged) sections.push(`Unstaged changes:\n${unstaged}`);
} catch {}
return sections.length > 0 ? sections.join("\n\n") : null;
} catch {
return null;
}
}
/**
* Merge hook feedback into tool output — matches original merge_hook_feedback()
*/
function mergeHookFeedback(hookMessages: string[], output: string, denied: boolean): string {
if (hookMessages.length === 0) return output;
const sections: string[] = [];
if (output.trim()) sections.push(output);
const label = denied ? "Hook feedback (denied)" : "Hook feedback";
sections.push(`${label}:\n${hookMessages.join("\n")}`);
return sections.join("\n\n");
}
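// Example: a lint hook that warns without denying:
//   mergeHookFeedback(["eslint: 2 warnings"], "file written", false)
//   → "file written\n\nHook feedback:\neslint: 2 warnings"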
const DEFAULT_CONFIG: AgentConfig = {
model: process.env.DEFAULT_MODEL || "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
apiProvider: "deepinfra",
maxTokens: 32768, // Qwen3-Coder supports up to 65k output
temperature: 0.5, // Lower temp = more focused/deterministic agent behavior
topP: 0.95, // Slightly restricted for more coherent tool calls
workDir: process.env.WORKSPACE_DIR || "/home/ubuntu",
effortLevel: "high",
};
/**
* Retry config for transient API errors.
* - 429 (rate limit): retry INFINITELY every 2 seconds until it works.
* - 500/502/503 (server errors): retry INFINITELY every 2 seconds.
* - Network errors: retry INFINITELY every 2 seconds.
* We NEVER give up on transient errors — just keep trying.
*/
const RETRY_DELAY_MS = 2000; // fixed 2 second interval — simple and reliable
/**
* Resolve the API URL and key based on provider config
*/
function resolveApiConfig(config: AgentConfig) {
// ─── HARDCODED FALLBACK — always works even if settings are corrupted ───
const FALLBACK_URL = "https://api.deepinfra.com/v1/openai";
const FALLBACK_MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo";
// Resolve model aliases (used for both default and custom paths)
const aliasMap: Record<string, string> = {
// Xiaomi MiMo
mimo: "XiaomiMiMo/MiMo-V2-Flash",
"mimo-flash": "XiaomiMiMo/MiMo-V2-Flash",
"mimo-v2": "XiaomiMiMo/MiMo-V2-Flash",
// Qwen models (DeepInfra)
"qwen-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
"qwen-coder-turbo": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
"qwen-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct",
"qwen3-235b": "Qwen/Qwen3-235B-A22B-Instruct-2507",
"qwen3-thinking": "Qwen/Qwen3-235B-A22B-Thinking-2507",
"qwen3.5": "Qwen/Qwen3.5-397B-A17B",
"qwen3-32b": "Qwen/Qwen3-32B",
"qwen3-8b": "Qwen/Qwen3-8B",
"qwen3-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo",
// Llama
llama: "meta-llama/Llama-3.3-70B-Instruct",
"llama-70b": "meta-llama/Llama-3.3-70B-Instruct",
"llama-4": "meta-llama/Llama-4-Maverick-17B-128E",
// DeepSeek
deepseek: "deepseek-ai/DeepSeek-V3.2",
"deepseek-r1": "deepseek-ai/DeepSeek-R1-0528",
"deepseek-v3": "deepseek-ai/DeepSeek-V3.2",
// NVIDIA
nemotron: "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B",
// StepFun
step: "stepfun-ai/Step-3.5-Flash",
"step-flash": "stepfun-ai/Step-3.5-Flash",
// Uncensored
hermes: "NousResearch/Hermes-3-Llama-3.1-70B",
"hermes-405b": "NousResearch/Hermes-3-Llama-3.1-405B",
uncensored: "NousResearch/Hermes-3-Llama-3.1-70B",
// OpenAI GPT-5.x family
"gpt5": "gpt-5.4",
"gpt-5": "gpt-5.4",
"gpt54": "gpt-5.4",
// Anthropic aliases
opus: "claude-opus-4-6",
sonnet: "claude-sonnet-4-6",
haiku: "claude-haiku-4-5-20251213",
// xAI
grok: "grok-3",
"grok-3": "grok-3",
// Google
gemini: "google/gemini-2.5-flash",
"gemini-pro": "google/gemini-2.5-pro",
};
// Treat empty, null, masked, or built-in providers as "use server default"
const hasCustomKey = config.apiKey && config.apiKey.length > 4 && !config.apiKey.startsWith("••••");
if (config.apiProvider === "claw" || config.apiProvider === "default" || config.apiProvider === "huggingface" || config.apiProvider === "deepinfra" || !hasCustomKey) {
const defaultModel = process.env.DEFAULT_MODEL || FALLBACK_MODEL;
const resolvedModel = aliasMap[config.model] || config.model || defaultModel;
// Use BUILT_IN_FORGE_API_URL from env — HuggingFace router or OpenAI
const baseUrl = (ENV.forgeApiUrl || FALLBACK_URL).replace(/\/$/, "");
const apiKey = ENV.forgeApiKey || process.env.BUILT_IN_FORGE_API_KEY || "";
console.log(`[agent] resolveApiConfig: using server default. URL=${baseUrl}, model=${resolvedModel}, hasKey=${!!apiKey}`);
return {
url: `${baseUrl}/chat/completions`,
key: apiKey,
model: resolvedModel || FALLBACK_MODEL,
};
}
// Custom provider path — user has their own API key
let baseUrl = config.apiBaseUrl || "";
if (!baseUrl) {
const providers: Record<string, string> = {
deepinfra: "https://api.deepinfra.com/v1/openai",
huggingface: "https://router.huggingface.co/v1",
xai: "https://api.x.ai/v1",
openrouter: "https://openrouter.ai/api/v1",
openai: "https://api.openai.com/v1",
anthropic: "https://api.anthropic.com/v1",
groq: "https://api.groq.com/openai/v1",
cerebras: "https://api.cerebras.ai/v1",
ollama: "http://localhost:11434/v1",
};
baseUrl = providers[config.apiProvider] || FALLBACK_URL;
}
const resolvedModel = aliasMap[config.model] || config.model || FALLBACK_MODEL;
console.log(`[agent] resolveApiConfig: custom provider. URL=${baseUrl}, model=${resolvedModel}`);
return {
url: `${baseUrl.replace(/\/$/, "")}/chat/completions`,
key: config.apiKey,
model: resolvedModel,
};
}
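// Resolution sketch (key and env values are hypothetical):
//   resolveApiConfig({ ...DEFAULT_CONFIG, model: "sonnet", apiProvider: "anthropic", apiKey: "sk-ant-..." })
//   → { url: "https://api.anthropic.com/v1/chat/completions", key: "sk-ant-...", model: "claude-sonnet-4-6" }
// Without a usable custom key, the server default (ENV.forgeApiUrl) wins
// regardless of the provider field.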
/**
* Send an SSE event to the client
*/
function sendSSE(res: Response, event: string, data: unknown) {
try {
res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`);
} catch {
// Connection may be closed
}
}
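// Wire format this produces for the client's SSE reader, e.g.:
//   event: text_delta
//   data: {"text":"Hello"}
// followed by a blank line that terminates the event.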
/**
* Run the agentic loop: send messages to LLM, execute tool calls, repeat.
* This is the core of the agent — it loops until the LLM stops calling tools.
*/
export async function runAgentLoop(
messages: AgentMessage[],
sessionId: number,
config: Partial<AgentConfig>,
res: Response,
signal?: AbortSignal
): Promise<{
finalMessages: AgentMessage[];
totalPromptTokens: number;
totalCompletionTokens: number;
totalCost: number;
model: string;
}> {
const cfg = { ...DEFAULT_CONFIG, ...config };
const apiConfig = resolveApiConfig(cfg);
const workDir = cfg.workDir || "/home/ubuntu";
// Get plan mode state
const planState = getPlanMode(sessionId);
// Read git status and diff (matches original ProjectContext::discover_with_git)
const gitStatus = readGitStatus(workDir);
const gitDiff = readGitDiff(workDir);
// Build system prompt with full environment context
const systemPrompt = buildSystemPrompt({
memory: cfg.memory,
effortLevel: cfg.effortLevel || "high",
planMode: planState.active,
planSteps: planState.steps,
customSystemPrompt: cfg.systemPrompt,
workDir,
platform: "linux",
model: apiConfig.model,
gitStatus,
gitDiff,
});
// Initialize UsageTracker (matches original conversation.rs)
const usageTracker = UsageTracker.new();
// Build conversation with system message first
const conversationMessages: AgentMessage[] = [
{ role: "system", content: systemPrompt },
...messages.filter((m) => m.role !== "system"),
];
let totalPromptTokens = 0;
let totalCompletionTokens = 0;
let totalCost = 0;
let iterations = 0;
let emptyResponseRetries = 0;
const MAX_EMPTY_RETRIES = 3;
// Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that
// causes runaway loops with Qwen3 which sometimes fails to stop generating.
// 200 iterations is more than enough for any real task.
const MAX_ITERATIONS = cfg.maxIterations || 200;
const assistantMessages: AgentMessage[] = [];
const toolResultMessages: AgentMessage[] = [];
// ─── Loop detection: minimal safety net ─────────────────────────────
// Only detect EXACT same tool+args repeated 5+ times (true infinite loop).
// Everything else is handled by MAX_ITERATIONS.
const recentToolSignatures: string[] = [];
const MAX_EXACT_REPEATS = 5;
// ─── MCP Tools Dynamic Injection (matches original claw-code) ──────────
// Initialize MCP servers from config and merge discovered tools with static TOOL_DEFINITIONS.
// This is how the original claw-code dynamically builds the tool list:
// 1. Load MCP server configs from .claw/settings.json
// 2. Connect to each server via stdio JSON-RPC
// 3. Call tools/list to discover available tools
// 4. Prefix tool names as mcp__servername__toolname
// 5. Merge with static tool definitions
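// Example: a server named "github" exposing a "search_issues" tool (names
// hypothetical) would surface in the merged list as:
//   { type: "function", function: { name: "mcp__github__search_issues", ... } }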
let allTools = [...TOOL_DEFINITIONS];
try {
const mcpTools = await initializeMcpFromConfig(workDir);
if (mcpTools.length > 0) {
const mcpManager = getMcpManager();
if (mcpManager) {
const mcpDefs = mcpManager.getToolDefinitions();
// Convert MCP tool format to OpenAI function calling format
const mcpToolDefs = mcpDefs.map((t) => ({
type: "function" as const,
function: {
name: t.name,
description: t.description,
parameters: t.input_schema || { type: "object", properties: {} },
},
}));
allTools = [...TOOL_DEFINITIONS, ...mcpToolDefs];
console.log(`[agent] MCP tools injected: ${mcpDefs.map((t) => t.name).join(", ")}`);
sendSSE(res, "status", {
status: "mcp_ready",
message: `MCP tools loaded: ${mcpDefs.length} tools from ${mcpManager.getConnectedServers().length} servers`,
});
}
}
} catch (err: any) {
console.error(`[agent] MCP initialization error (non-fatal):`, err.message);
// MCP init failure is non-fatal — agent continues with static tools only
}
// ─── Context-aware compaction config ──────────────────────────────
// Original claw-code uses percentage-based thresholds, not fixed 10k tokens.
// We compute the threshold as 70% of the model's context window.
const contextWindow = MODEL_CONTEXT_WINDOWS[apiConfig.model] || DEFAULT_CONTEXT_WINDOW;
const dynamicCompactionConfig: CompactionConfig = {
preserveRecentMessages: DEFAULT_COMPACTION_CONFIG.preserveRecentMessages,
maxEstimatedTokens: Math.floor(contextWindow * 0.7),
};
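// Example: Qwen3-Coder-480B has a 262,144-token window, so compaction targets
// floor(262144 * 0.7) = 183,500 estimated tokens.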
sendSSE(res, "status", { status: "thinking", message: "Processing your request..." });
while (iterations < MAX_ITERATIONS) {
iterations++;
if (signal?.aborted) {
sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" });
break;
}
// Build API request
// Determine max_tokens limit based on provider
const isDeepInfra = apiConfig.url.includes("deepinfra.com");
const isHuggingFace = apiConfig.url.includes("huggingface.co");
const maxTokensLimit = isHuggingFace ? 32000 : 65536; // HF router caps output lower than other providers
// Detect if model supports thinking/reasoning mode (Qwen3 Thinking, DeepSeek-R1)
const isThinkingModel = apiConfig.model.includes("Thinking") || apiConfig.model.includes("R1");
const payload: Record<string, unknown> = {
model: apiConfig.model,
messages: conversationMessages.map((m) => {
const msg: Record<string, unknown> = { role: m.role, content: m.content };
if (m.tool_calls) msg.tool_calls = m.tool_calls;
if (m.tool_call_id) msg.tool_call_id = m.tool_call_id;
if (m.name) msg.name = m.name;
return msg;
}),
tools: allTools,
// Match original: always "auto" — let the model decide what tools to use
tool_choice: "auto",
max_tokens: Math.min(cfg.maxTokens, maxTokensLimit),
temperature: isThinkingModel ? 0.6 : cfg.temperature, // thinking models need lower temp
top_p: cfg.topP,
stream: true,
};
// Enable thinking/reasoning for supported models (Qwen3 Thinking, DeepSeek-R1)
if (isThinkingModel && isDeepInfra) {
(payload as any).extra_body = { enable_thinking: true };
}
// Qwen3-Coder: hybrid thinking mode — model decides when to think
// DO NOT disable thinking; Qwen3 automatically uses <think> blocks for complex tasks
// The /no_think in system prompt is only a soft hint for simple responses,
// the model will override it when it needs to reason through tool calls or code
// NOTE: Prefill technique (empty assistant message) was removed because it caused
// the model to "talk to itself" — seeing an empty assistant message made it think
// it already responded and continue generating follow-up messages in a loop.
// Anti-censorship is handled purely via system prompt directives instead.
sendSSE(res, "message_start", { iteration: iterations });
try {
// ─── Retry loop: INFINITE retries for transient errors ─────────────
// 429 = rate limit: the API is telling us to slow down, NOT to give up.
// We keep retrying at a fixed 2s interval (RETRY_DELAY_MS) until it works.
// 500/502/503 and network errors are retried the same way; the retry
// counters below only label the status messages and never cap retries.
let response: globalThis.Response | null = null;
let lastErrorText = "";
let lastStatus = 0;
let serverRetryCount = 0;
let rateLimitRetryCount = 0;
while (true) {
if (signal?.aborted) break;
try {
response = await fetch(apiConfig.url, {
method: "POST",
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${apiConfig.key}`,
},
body: JSON.stringify(payload),
signal,
});
} catch (fetchErr: any) {
// Network error (DNS, connection refused, etc.) — retry every 2s
if (signal?.aborted) break;
serverRetryCount++;
console.error(`[agent] Fetch error (retry #${serverRetryCount}):`, fetchErr.message);
sendSSE(res, "status", {
status: "retrying",
message: `Network error, retrying in 2s... (attempt #${serverRetryCount})`,
});
await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
continue;
}
if (response.ok) break;
lastStatus = response.status;
lastErrorText = await response.text();
// ─── 429 Rate Limit: INFINITE retry every 2s ───
if (response.status === 429) {
rateLimitRetryCount++;
console.log(`[agent] Rate limited (429) — retry #${rateLimitRetryCount} in 2s`);
sendSSE(res, "status", {
status: "rate_limited",
message: `Rate limited by API — retrying in 2s... (attempt #${rateLimitRetryCount})`,
});
await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
response = null;
continue; // NEVER give up on 429
}
// ─── 500/502/503 Server errors: INFINITE retry every 2s ───
if ([500, 502, 503].includes(response.status)) {
serverRetryCount++;
console.log(`[agent] Server error ${response.status} — retry #${serverRetryCount} in 2s`);
sendSSE(res, "status", {
status: "retrying",
message: `Server error ${response.status}, retrying in 2s... (attempt #${serverRetryCount})`,
});
await new Promise((r) => setTimeout(r, RETRY_DELAY_MS));
response = null;
continue;
}
// Any other error (400, 401, 403, 404, etc.) — don't retry
break;
}
if (!response || !response.ok) {
console.error(`[agent] API error ${lastStatus}:`, lastErrorText);
console.error(`[agent] Payload model:`, apiConfig.model);
console.error(`[agent] Payload messages count:`, (payload.messages as any[]).length);
// ─── AUTO-COMPACT on context overflow (400 error) ─────────────
if (lastStatus === 400 && (lastErrorText.includes("context_length") || lastErrorText.includes("too many tokens") || lastErrorText.includes("maximum context") || lastErrorText.includes("token limit") || lastErrorText.includes("too long"))) {
console.log(`[agent] Context overflow detected — auto-compacting conversation...`);
sendSSE(res, "status", {
status: "compacting",
message: "Context window exceeded — auto-compacting conversation...",
});
try {
const session = agentMessagesToSession(conversationMessages);
// LLM-based summarization: use the same API to produce a real summary
const llmFetch = async (msgs: Array<{role: string; content: string}>) => {
const summaryResp = await fetch(apiConfig.url, {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` },
body: JSON.stringify({
model: apiConfig.model,
messages: msgs,
max_tokens: 2000,
temperature: 0.3,
stream: false,
}),
});
if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`);
const json = await summaryResp.json();
return json.choices?.[0]?.message?.content || "";
};
const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetch);
if (compactResult.removedMessageCount > 0) {
const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession);
conversationMessages.length = 0;
conversationMessages.push({ role: "system", content: systemPrompt });
conversationMessages.push(...compactedAgentMessages);
console.log(`[agent] Auto-compact (LLM): removed ${compactResult.removedMessageCount} messages, kept ${conversationMessages.length}`);
sendSSE(res, "auto_compact", {
removedCount: compactResult.removedMessageCount,
keptCount: conversationMessages.length,
summary: compactResult.formattedSummary,
});
continue; // retry with compacted context
} else {
console.error(`[agent] Auto-compact produced no reduction — breaking`);
sendSSE(res, "error", {
message: `Context overflow but compaction couldn't reduce further`,
details: lastErrorText,
});
break;
}
} catch (compactErr: any) {
console.error(`[agent] Auto-compact failed:`, compactErr.message);
sendSSE(res, "error", {
message: `Context overflow — auto-compact failed: ${compactErr.message}`,
details: lastErrorText,
});
break;
}
}
// Non-context 400 errors — log details for debugging
if (lastStatus === 400) {
console.error(`[agent] Full error body:`, lastErrorText);
(payload.messages as any[]).forEach((m: any, i: number) => {
console.error(`[agent] msg[${i}] role=${m.role} content_type=${typeof m.content} content_len=${String(m.content || '').length} has_tool_calls=${!!m.tool_calls} has_tool_call_id=${!!m.tool_call_id}`);
});
}
sendSSE(res, "error", {
message: `API error: ${lastStatus}${lastStatus === 429 ? ' (rate limit)' : ''}${lastErrorText.substring(0, 200)}`,
details: lastErrorText,
});
break;
}
// Process streaming response
let result: { content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string } }>; usage?: any };
try {
result = await processStream(response, res, signal);
} catch (streamErr: any) {
// Stream processing error — treat as transient, retry
console.error(`[agent] Stream processing error:`, streamErr.message);
if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) {
sendSSE(res, "status", { status: "retrying", message: `Stream error, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` });
await new Promise(r => setTimeout(r, 1500));
continue;
}
sendSSE(res, "error", { message: `Stream failed after ${MAX_EMPTY_RETRIES} retries: ${streamErr.message}` });
break;
}
// ─── Bug #1 fix: Handle empty LLM response with retry ─────────
// Original claw-code retries on empty response instead of crashing.
// Open-source models via HuggingFace often return empty streams.
if (!result.content && result.toolCalls.length === 0) {
if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) {
console.warn(`[agent] Empty response from LLM — retry ${emptyResponseRetries}/${MAX_EMPTY_RETRIES}`);
sendSSE(res, "status", { status: "retrying", message: `Empty response from model, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` });
await new Promise(r => setTimeout(r, 1500));
continue; // retry same iteration
}
console.error(`[agent] LLM returned empty response ${MAX_EMPTY_RETRIES} times — giving up`);
sendSSE(res, "error", { message: `Model returned empty response after ${MAX_EMPTY_RETRIES} retries. Try a different model or reduce context.` });
break;
}
emptyResponseRetries = 0; // reset on successful response
// Track usage with UsageTracker (matches original)
if (result.usage) {
totalPromptTokens += result.usage.prompt_tokens || 0;
totalCompletionTokens += result.usage.completion_tokens || 0;
usageTracker.record({
input_tokens: result.usage.prompt_tokens || 0,
output_tokens: result.usage.completion_tokens || 0,
cache_creation_input_tokens: result.usage.cache_creation_input_tokens || 0,
cache_read_input_tokens: result.usage.cache_read_input_tokens || 0,
});
}
// Add assistant message to conversation
const assistantMessage: AgentMessage = {
role: "assistant",
// Match original: null when no content (even with tool_calls)
content: result.content || null,
};
if (result.toolCalls && result.toolCalls.length > 0) {
assistantMessage.tool_calls = result.toolCalls;
}
conversationMessages.push(assistantMessage);
assistantMessages.push(assistantMessage);
// If no tool calls, we're done — the LLM has finished responding.
// This matches the original claw-code behavior exactly:
// the model decides when to stop by not calling tools.
if (!result.toolCalls || result.toolCalls.length === 0) {
sendSSE(res, "message_end", {
promptTokens: totalPromptTokens,
completionTokens: totalCompletionTokens,
cost: totalCost,
model: apiConfig.model,
});
break;
}
// ─── Minimal loop detection: only catch TRUE infinite loops ───────
// Only break if the EXACT same tool+args is repeated 5+ times.
// This is the only safety net beyond MAX_ITERATIONS.
// The original claw-code has NO loop detection at all — it trusts the model.
const currentToolSig = result.toolCalls.map((tc: any) => `${tc.function.name}:${tc.function.arguments}`).join("|");
recentToolSignatures.push(currentToolSig);
if (recentToolSignatures.length > MAX_EXACT_REPEATS) {
recentToolSignatures.shift();
}
if (recentToolSignatures.length >= MAX_EXACT_REPEATS) {
const allSame = recentToolSignatures.every(r => r === recentToolSignatures[0]);
if (allSame) {
console.warn(`[agent] Infinite loop detected: exact same tool call repeated ${MAX_EXACT_REPEATS} times — breaking`);
sendSSE(res, "error", {
message: `⚠️ обнаружен бесконечный цикл. попробуй переформулировать запрос`,
});
sendSSE(res, "message_end", {
promptTokens: totalPromptTokens,
completionTokens: totalCompletionTokens,
cost: totalCost,
model: apiConfig.model,
});
break;
}
}
// ─── Execute tool calls ──────────────────────────────────────────
// Bug #2+#3 fix: Each tool call is wrapped in its own try-catch.
// Original claw-code sends tool errors back to LLM as tool results,
// letting the model decide how to handle them. We NEVER break the
// loop on a tool error — only on fatal API/stream errors.
for (const toolCall of result.toolCalls) {
const toolName = toolCall.function.name;
let toolArgs: Record<string, unknown> = {};
let argParseError = false;
try {
toolArgs = JSON.parse(toolCall.function.arguments || "{}");
} catch (parseErr: any) {
// Try JSON repair before giving up
try {
const { jsonrepair } = await import("jsonrepair");
const repaired = jsonrepair(toolCall.function.arguments || "{}");
toolArgs = JSON.parse(repaired);
console.info(`[agent] Repaired malformed JSON for ${toolName}`);
} catch (repairErr: any) {
argParseError = true;
console.warn(`[agent] Malformed tool args for ${toolName} (repair failed):`, toolCall.function.arguments?.substring(0, 200));
}
}
sendSSE(res, "tool_call_start", {
id: toolCall.id,
name: toolName,
arguments: toolCall.function.arguments,
});
let toolOutput: string;
let isError = false;
// If JSON args were malformed, skip execution and tell LLM to fix
if (argParseError) {
toolOutput = `Error: Your tool call arguments for '${toolName}' contained malformed JSON. The raw arguments were: ${(toolCall.function.arguments || "").substring(0, 500)}. Please fix the JSON and try again.`;
isError = true;
} else try {
// ─── Pre-tool hooks (matches original HookRunner.run_pre_tool_use) ──
const preHookResult = await runPreToolHooks(toolName, sessionId, toolArgs, workDir);
if (!preHookResult.allowed) {
// Hook denied the tool execution (exit code 2 = deny)
toolOutput = preHookResult.message || `Tool '${toolName}' was denied by pre-tool hook`;
isError = true;
sendSSE(res, "permission_denied", {
toolName,
toolCallId: toolCall.id,
reason: toolOutput,
needsPrompt: false,
});
} else {
// Execute the tool with the correct working directory
const toolResult = await executeTool(toolName, toolArgs, sessionId, workDir);
if (toolResult.isError && toolResult.output.includes("needs one-time approval")) {
sendSSE(res, "permission_prompt", {
toolName,
toolCallId: toolCall.id,
reason: toolResult.output,
});
}
toolOutput = toolResult.output;
isError = toolResult.isError || false;
// Merge pre-hook feedback (matches original merge_hook_feedback)
if (preHookResult.message) {
toolOutput = mergeHookFeedback([preHookResult.message], toolOutput, false);
}
// ─── Post-tool hooks (matches original HookRunner.run_post_tool_use) ──
const postHookResult = await runPostToolHooks(toolName, sessionId, toolResult, workDir);
toolOutput = postHookResult.output;
isError = postHookResult.isError || false;
}
} catch (toolExecError: any) {
// ─── Bug #3 fix: Tool exception → error result for LLM ──────
// Original claw-code: tool errors become tool results, NOT loop breaks.
// The LLM sees the error and can try a different approach.
console.error(`[agent] Tool '${toolName}' threw exception:`, toolExecError.message);
toolOutput = `Tool execution error: ${toolExecError.message}`;
isError = true;
}
// No error classification or guidance injection.
// The model receives raw error output and decides how to handle it.
// This matches the original claw-code behavior.
sendSSE(res, "tool_result", {
toolCallId: toolCall.id,
toolName,
output: toolOutput,
isError,
durationMs: 0,
});
// ─── Special SSE events for interactive tools ────────────────
// SendUserMessage / Brief: emit SSE for frontend display but DO NOT break the loop.
// Original claw-code does NOT stop on SendUserMessage — the model can send
// progress updates ("checking...", "found vulnerability...") AND continue working.
// Breaking here was the #1 cause of the agent stopping mid-task.
if ((toolName === "SendUserMessage" || toolName === "Brief" || toolName === "ask_user") && !isError) {
sendSSE(res, "assistant_message", {
message: toolArgs.message || toolArgs.question || "",
attachments: toolArgs.attachments || [],
});
}
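// So a mid-task SendUserMessage("still scanning...") now emits, in order:
// tool_call_start → tool_result → assistant_message, and execution continues
// into the next iteration instead of returning early.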
// Plan/Todo tools: emit plan state updates
if (["TodoWrite", "plan_create", "plan_update", "enter_plan_mode", "exit_plan_mode"].includes(toolName)) {
const updatedPlan = getPlanMode(sessionId);
sendSSE(res, "plan_update", {
active: updatedPlan.active,
steps: updatedPlan.steps,
});
}
// Add tool result to conversation for the LLM to process
const toolResultMsg: AgentMessage = {
role: "tool",
content: toolOutput,
tool_call_id: toolCall.id,
name: toolName,
};
conversationMessages.push(toolResultMsg);
toolResultMessages.push(toolResultMsg);
}
// No consecutive error detection — the model handles errors naturally.
// MAX_ITERATIONS (200) is the ultimate safety net.
// SendUserMessage does NOT break the loop (matches original).
// ─── Proactive auto-compact check ─────────────────────────────
// Check if conversation is approaching context window limit and compact proactively
const estimatedTokens = estimateConversationTokens(conversationMessages);
// contextWindow was computed once before the loop
const contextUsagePercent = Math.round((estimatedTokens / contextWindow) * 100);
// Emit context usage SSE for frontend tracking
sendSSE(res, "context_usage", {
estimatedTokens,
contextWindow,
usagePercent: contextUsagePercent,
messageCount: conversationMessages.length,
});
// Proactive compaction at 80% context usage
if (contextUsagePercent >= 80) {
console.log(`[agent] Context at ${contextUsagePercent}% — proactive auto-compact`);
sendSSE(res, "status", {
status: "compacting",
message: `Context at ${contextUsagePercent}% — auto-compacting to free space...`,
});
try {
const session = agentMessagesToSession(conversationMessages);
// LLM-based summarization for proactive compaction
const llmFetchProactive = async (msgs: Array<{role: string; content: string}>) => {
const summaryResp = await fetch(apiConfig.url, {
method: "POST",
headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` },
body: JSON.stringify({
model: apiConfig.model,
messages: msgs,
max_tokens: 2000,
temperature: 0.3,
stream: false,
}),
});
if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`);
const json = await summaryResp.json();
return json.choices?.[0]?.message?.content || "";
};
const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetchProactive);
if (compactResult.removedMessageCount > 0) {
const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession);
conversationMessages.length = 0;
// CRITICAL: Re-prepend original system prompt before compacted summary.
conversationMessages.push({ role: "system", content: systemPrompt });
conversationMessages.push(...compactedAgentMessages);
// Inject current todo/plan state so the agent doesn't lose its plan after compaction
const todoState = (() => {
try {
// Lazy require: todoLists is internal executor state that is not part of
// the static import surface at the top of this file.
const executor = require("../tools/executor");
const plan = getPlanMode(sessionId);
const todos = executor.todoLists?.get?.(sessionId) || [];
let state = "";
if (todos.length > 0) {
state += "\n\n[PRESERVED TODO LIST]\n" + todos.map((t: any, i: number) => {
const icon = t.status === "completed" ? "\u2713" : t.status === "in_progress" ? "\u25cf" : "\u25cb";
return ` ${icon} ${i + 1}. ${t.content} [${t.status}]`;
}).join("\n");
}
if (plan?.active && plan.steps?.length > 0) {
state += "\n\n[PRESERVED PLAN]\n" + plan.steps.map((s: any) => {
const icon = s.status === "done" ? "\u2713" : s.status === "in_progress" ? "\u25cf" : "\u25a1";
return ` ${icon} ${s.id}. ${s.text} [${s.status}]`;
}).join("\n");
}
return state;
} catch { return ""; }
})();
if (todoState) {
// Append todo state to the last message (if it has string content) so the model sees it
const lastMsg = conversationMessages[conversationMessages.length - 1];
if (lastMsg && typeof lastMsg.content === "string") {
lastMsg.content += todoState;
}
}
sendSSE(res, "auto_compact", {
removedCount: compactResult.removedMessageCount,
keptCount: conversationMessages.length,
summary: compactResult.formattedSummary,
});
console.log(`[agent] Proactive compact: removed ${compactResult.removedMessageCount} messages`);
}
} catch (compactErr: any) {
console.error(`[agent] Proactive compact failed (non-fatal):`, compactErr.message);
}
}
// ─── Buddy events SSE ────────────────────────────────────────────
// Emit buddy_event for each tool call so frontend can award XP
for (const toolCall of result.toolCalls) {
const tn = toolCall.function.name;
sendSSE(res, "buddy_event", {
type: "tool_call",
toolName: tn,
iteration: iterations,
});
// Special buddy events for file creation
if (tn === "write_file" || tn === "create_file") {
sendSSE(res, "buddy_event", {
type: "file_created",
toolName: tn,
iteration: iterations,
});
}
}
// Continue the loop — LLM will see tool results and decide next action
sendSSE(res, "status", {
status: "thinking",
message: `Processing tool results (iteration ${iterations}, context: ${contextUsagePercent}%)...`,
});
} catch (error: any) {
// ─── Bug #2 fix: Distinguish fatal vs transient errors ────────
// Only AbortError and unrecoverable errors should break the loop.
// Stream/fetch errors are already handled above with retry logic.
if (error.name === "AbortError" || signal?.aborted) {
sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" });
break;
}
// For other errors, log and break (these are truly unexpected)
console.error(`[agent] Unexpected error in agent loop:`, error.message, error.stack);
sendSSE(res, "error", { message: error.message || "Unknown error" });
break;
}
}
if (iterations >= MAX_ITERATIONS) {
sendSSE(res, "error", { message: `Maximum iterations (${MAX_ITERATIONS}) reached. Use /compact to reduce context and continue.` });
}
// ─── Buddy: session_completed event ─────────────────────────────────
// Emit session_completed so Buddy can award XP for finishing a turn
sendSSE(res, "buddy_event", {
type: "session_completed",
iterations,
toolCallCount: toolResultMessages.length,
});
// Calculate cost using UsageTracker (matches original)
const cumulativeUsage = usageTracker.cumulativeUsage();
const modelPricing = pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing();
const costEstimate = estimateCostUsdWithPricing(cumulativeUsage, modelPricing);
totalCost = totalCostUsd(costEstimate);
// Emit usage summary lines (matches original summary_lines_for_model)
const usageSummary = summaryLinesForModel(cumulativeUsage, "session", apiConfig.model);
sendSSE(res, "usage", {
promptTokens: totalPromptTokens,
completionTokens: totalCompletionTokens,
totalTokens: totalPromptTokens + totalCompletionTokens,
cost: totalCost,
cacheCreationTokens: cumulativeUsage.cache_creation_input_tokens,
cacheReadTokens: cumulativeUsage.cache_read_input_tokens,
usageSummary,
turns: usageTracker.turns(),
formattedCost: formatUsd(totalCost),
});
return {
finalMessages: conversationMessages.filter((m) => m.role !== "system"),
totalPromptTokens,
totalCompletionTokens,
totalCost,
model: apiConfig.model,
};
}
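/**
 * Usage sketch: a hypothetical Express SSE route (the real router lives
 * elsewhere in this repo; names and paths here are illustrative only):
 *
 *   import express from "express";
 *   const app = express();
 *   app.post("/api/sessions/:id/chat", express.json(), async (req, res) => {
 *     res.setHeader("Content-Type", "text/event-stream");
 *     res.setHeader("Cache-Control", "no-cache");
 *     const abort = new AbortController();
 *     req.on("close", () => abort.abort());
 *     await runAgentLoop(
 *       [{ role: "user", content: req.body.message }],
 *       Number(req.params.id),
 *       {},               // fall back to DEFAULT_CONFIG
 *       res,
 *       abort.signal,
 *     );
 *     res.end();
 *   });
 */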
/**
* Process a streaming response from the LLM API (OpenAI-compatible SSE format)
*/
async function processStream(
response: globalThis.Response,
res: Response,
signal?: AbortSignal
): Promise<{
content: string;
toolCalls: Array<{
id: string;
type: "function";
function: { name: string; arguments: string };
}>;
usage?: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number };
}> {
const reader = response.body?.getReader();
if (!reader) throw new Error("No response body");
const decoder = new TextDecoder();
let content = "";
const toolCalls: Map<
number,
{ id: string; type: "function"; function: { name: string; arguments: string } }
> = new Map();
let usage: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number } | undefined;
let buffer = "";
try {
while (true) {
if (signal?.aborted) break;
const { done, value } = await reader.read();
if (done) break;
buffer += decoder.decode(value, { stream: true });
// Process complete SSE lines
const lines = buffer.split("\n");
buffer = lines.pop() || "";
for (const line of lines) {
if (!line.startsWith("data: ")) continue;
const data = line.slice(6).trim();
if (data === "[DONE]") continue;
try {
const chunk = JSON.parse(data);
const delta = chunk.choices?.[0]?.delta;
// Detect API errors returned inside the SSE stream (e.g. DeepInfra "Operation not allowed")
if (chunk.error) {
const errMsg = chunk.error.message || chunk.error.type || JSON.stringify(chunk.error);
console.error(`[agent] API error in stream: ${errMsg}`);
throw new Error(`API error in stream: ${errMsg}`);
}
if (!delta) {
if (chunk.usage) {
usage = {
prompt_tokens: chunk.usage.prompt_tokens || 0,
completion_tokens: chunk.usage.completion_tokens || 0,
cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0,
cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0,
};
}
continue;
}
// Reasoning/thinking content (Qwen3 Thinking, DeepSeek-R1)
// These models return reasoning in delta.reasoning_content before the actual response
if (delta.reasoning_content) {
sendSSE(res, "thinking_delta", { text: delta.reasoning_content });
}
// Text content streaming
if (delta.content) {
content += delta.content;
sendSSE(res, "text_delta", { text: delta.content });
}
// Tool call streaming
if (delta.tool_calls) {
for (const tc of delta.tool_calls) {
const idx = tc.index ?? 0;
if (!toolCalls.has(idx)) {
toolCalls.set(idx, {
id: tc.id || `call_${idx}_${Date.now()}`,
type: "function",
function: { name: tc.function?.name || "", arguments: "" },
});
}
const existing = toolCalls.get(idx)!;
if (tc.id) existing.id = tc.id;
if (tc.function?.name) existing.function.name = tc.function.name;
if (tc.function?.arguments) {
existing.function.arguments += tc.function.arguments;
sendSSE(res, "tool_call_delta", {
id: existing.id,
name: existing.function.name,
arguments: tc.function.arguments,
});
}
}
}
// Usage info
if (chunk.usage) {
usage = {
prompt_tokens: chunk.usage.prompt_tokens || 0,
completion_tokens: chunk.usage.completion_tokens || 0,
cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0,
cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0,
};
}
} catch (parseErr: any) {
// Re-throw API errors (these are NOT malformed JSON — they're real errors)
if (parseErr?.message?.startsWith('API error in stream:')) {
throw parseErr;
}
// Skip genuinely malformed JSON chunks (partial SSE data, etc.)
}
}
}
} finally {
reader.releaseLock();
}
// Process remaining buffer (last line without trailing \n)
if (buffer.trim() && buffer.startsWith("data: ")) {
const data = buffer.slice(6).trim();
if (data !== "[DONE]") {
try {
const chunk = JSON.parse(data);
const delta = chunk.choices?.[0]?.delta;
if (delta?.content) {
content += delta.content;
sendSSE(res, "text_delta", { text: delta.content });
}
if (delta?.tool_calls) {
for (const tc of delta.tool_calls) {
const idx = tc.index ?? 0;
if (!toolCalls.has(idx)) {
toolCalls.set(idx, {
id: tc.id || `call_${idx}_${Date.now()}`,
type: "function",
function: { name: tc.function?.name || "", arguments: "" },
});
}
const existing = toolCalls.get(idx)!;
if (tc.id) existing.id = tc.id;
if (tc.function?.name) existing.function.name = tc.function.name;
if (tc.function?.arguments) existing.function.arguments += tc.function.arguments;
}
}
if (chunk.usage) {
usage = {
prompt_tokens: chunk.usage.prompt_tokens || 0,
completion_tokens: chunk.usage.completion_tokens || 0,
cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0,
cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0,
};
}
// Check finish_reason for truncation
const finishReason = chunk.choices?.[0]?.finish_reason;
if (finishReason === "length") {
console.warn("[agent] Response truncated (finish_reason=length) — tool call args may be incomplete");
}
} catch { /* skip malformed */ }
}
}
// Original claw-code retries on empty response instead of throwing.
if (content.length === 0 && toolCalls.size === 0) {
console.warn("[agent] LLM returned empty response — will be retried by agent loop");
}
return {
content,
toolCalls: Array.from(toolCalls.values()),
usage,
};
}
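// Reference for the chunks parsed above (OpenAI-compatible streaming, abbreviated):
//   data: {"choices":[{"delta":{"content":"Hel"}}]}
//   data: {"choices":[{"delta":{"tool_calls":[{"index":0,"id":"call_1","function":{"name":"read_file","arguments":"{\"pa"}}]}}]}
//   data: {"choices":[{"delta":{},"finish_reason":"stop"}],"usage":{"prompt_tokens":123,"completion_tokens":45}}
//   data: [DONE]
// Tool-call arguments arrive as string fragments keyed by `index` and are
// concatenated until the stream ends.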
/**
* Estimate cost based on model and token counts
*/
function estimateCost(model: string, promptTokens: number, completionTokens: number): number {
// Pricing per 1M tokens — aligned with original claw-code model registry
const pricing: Record<string, { input: number; output: number }> = {
// Claw API / Anthropic
"claude-opus-4-6": { input: 15.00, output: 75.00 },
"claude-sonnet-4-6": { input: 3.00, output: 15.00 },
"claude-haiku-4-5-20251213": { input: 0.80, output: 4.00 },
// xAI Grok
"grok-3": { input: 3.00, output: 15.00 },
"grok-3-mini": { input: 0.30, output: 0.50 },
"grok-2": { input: 2.00, output: 10.00 },
// OpenAI
"gpt-5.4": { input: 2.50, output: 15.00 },
"gpt-5.4-mini": { input: 0.40, output: 1.60 },
"gpt-5.3-codex": { input: 2.50, output: 10.00 },
"gpt-4.1": { input: 2.00, output: 8.00 },
"gpt-4.1-mini": { input: 0.40, output: 1.60 },
"o3": { input: 10.00, output: 40.00 },
"o4-mini": { input: 1.10, output: 4.40 },
// HuggingFace Inference API (free tier = $0, Pro tier = included in subscription)
"XiaomiMiMo/MiMo-V2-Flash": { input: 0.00, output: 0.00 },
"Qwen/Qwen3-Coder-Next": { input: 0.00, output: 0.00 },
"Qwen/Qwen3-8B": { input: 0.00, output: 0.00 },
"Qwen/Qwen3-Coder-30B-A3B-Instruct": { input: 0.00, output: 0.00 },
"meta-llama/Llama-3.3-70B-Instruct": { input: 0.00, output: 0.00 },
"deepseek-ai/DeepSeek-V3.2": { input: 0.00, output: 0.00 },
"deepseek-ai/DeepSeek-R1": { input: 0.00, output: 0.00 },
// OpenRouter variants
"anthropic/claude-opus-4-6": { input: 15.00, output: 75.00 },
"anthropic/claude-sonnet-4-6": { input: 3.00, output: 15.00 },
"google/gemini-2.5-pro": { input: 1.25, output: 10.00 },
"google/gemini-2.5-flash": { input: 0.15, output: 0.60 },
};
const rates = pricing[model] || { input: 1.00, output: 3.00 };
return (promptTokens * rates.input + completionTokens * rates.output) / 1_000_000;
}
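// Worked example: 12,000 prompt + 800 completion tokens on claude-sonnet-4-6:
//   (12000 * 3.00 + 800 * 15.00) / 1_000_000 = $0.048
//   estimateCost("claude-sonnet-4-6", 12_000, 800); // → 0.048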