File size: 19,654 Bytes
cb4fc00
58360ec
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e8f3b8
 
 
cb4fc00
 
 
 
4e8f3b8
 
 
 
 
 
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be3f641
cb4fc00
 
2aaa293
 
 
cb4fc00
 
 
 
 
2aaa293
 
 
 
 
 
 
 
 
 
 
 
cb4fc00
 
 
 
 
 
 
 
2aaa293
 
 
 
 
cb4fc00
 
 
 
 
2aaa293
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2aaa293
 
 
 
 
 
 
 
 
 
 
 
 
 
cb4fc00
2aaa293
cb4fc00
 
2aaa293
cb4fc00
 
 
be3f641
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8fa7bb6
 
 
 
cb4fc00
 
8fa7bb6
 
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01de83c
cb4fc00
 
 
 
 
 
 
 
 
 
 
 
0426577
fa48788
477e0cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68809c8
 
 
 
 
 
 
0426577
cb4fc00
477e0cf
 
 
 
 
 
cb4fc00
477e0cf
 
 
 
 
 
 
 
cb4fc00
477e0cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5424aa
 
cb4fc00
477e0cf
d65f551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01de83c
78c65dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d65f551
 
 
 
 
0426577
d65f551
9248162
d65f551
 
 
 
 
 
68809c8
 
 
 
 
 
 
0426577
d65f551
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5424aa
 
d65f551
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
import { createServerFn } from "@tanstack/react-start";
import { fetchAIWithFallback, getAIConfig } from "./ai-config.server";
import { z } from "zod";

// Request payload for diff mode (validated by the debugBranch server function).
// Both fields must be non-empty strings; the diff may be at most 200,000
// characters long and the failure description at most 5,000.
const InputSchema = z.object({
  diff: z.string().min(1).max(200_000),
  failureDescription: z.string().min(1).max(5_000),
});

/**
 * One ranked root-cause candidate returned to the caller.
 * Built by analyzeDiff (one per implicated hunk) and by analyzeSnippet
 * (one per reported snippet line).
 */
export type Suspect = {
  // Diff mode: file path of the hunk. Snippet mode: "snippet" or "snippet.<language>".
  filePath: string;
  // Real name restored from the AI's function token; diff mode falls back to the
  // hunk-header context. null when neither is available.
  functionName: string | null;
  // Diff mode: first new-file line of the hunk. Snippet mode: the reported line.
  lineStart: number;
  // Diff mode: last added line of the hunk. Snippet mode: same as lineStart.
  lineEnd: number;
  confidence: "high" | "medium" | "low";
  // AI explanation with fn_NNNN tokens mapped back to real names.
  mechanism: string;
  // Short AI label for the change, tokens restored.
  changeSummary: string;
  // Diff mode: up to 6 removed lines of the hunk (null if none). Always null in snippet mode.
  beforeSnippet: string | null;
  // Diff mode: up to 6 added lines of the hunk (null if none).
  // Snippet mode: the original snippet line, or the restored AI fragment if the line is out of range.
  afterSnippet: string | null;
};

// One identifier replaced by the sanitizer: the opaque token, the real name it
// stands for, and how many times that name was seen in the input.
export type AuditEntry = { token: string; real: string; occurrences: number };
// Side-by-side excerpt of the input before and after sanitization.
export type AuditSample = { original: string; sanitized: string };

/**
 * Response shape shared by diff mode and snippet mode: the ranked suspects,
 * the AI's overall summary, and a record of what the sanitizer did.
 */
export type DebugResult = {
  suspects: Suspect[];
  summary: string;
  // Counters reported by sanitize(): distinct identifiers tokenized, lines that
  // had comment text removed, and secret-pattern matches replaced.
  sanitizationStats: { identifiersTokenized: number; commentsStripped: number; secretsBlocked: number };
  audit: {
    tokenMap: AuditEntry[];               // one entry per tokenized identifier, most frequent first
    redactedComments: string[];           // up to 20 excerpts recorded while stripping comments
    secretMatches: { pattern: string; replaced: string }[]; // pattern label + length summary, never the real secret
    sample: AuditSample;                  // first 30 lines: secret-redacted original vs fully sanitized
  };
};

// ───────────────────────── IP Shield: sanitizer ─────────────────────────
// Strips comments + secrets, replaces identifiers with fn_NNNN tokens.
// Token map stays server-side and is used to restore real names afterwards.

/**
 * Credential-shaped text that is redacted before any other processing.
 * Index-aligned with PATTERN_LABELS inside sanitize(); keep the two in step.
 */
const SECRET_PATTERNS = [
  /(?:api[_-]?key|secret|token|password|bearer)\s*[:=]\s*["']?[A-Za-z0-9_\-]{8,}["']?/gi,
  /sk-[A-Za-z0-9]{20,}/g,
  /eyJ[A-Za-z0-9_\-]{20,}\.[A-Za-z0-9_\-]{20,}\.[A-Za-z0-9_\-]{10,}/g, // JWT
];

/** Words that are passed through verbatim instead of being replaced by a token. */
const RESERVED = new Set([
  "if","else","for","while","return","def","class","import","from","const","let","var",
  "function","async","await","try","catch","throw","new","null","true","false","this",
  "self","int","str","bool","void","public","private","static","export","default",
  "diff","git","index","main","feature","a","b","fix","add","remove","update",
  // The redaction placeholder must survive tokenization so the marker stays readable
  // and is not counted as an identifier.
  "SECRET_REDACTED",
]);

/**
 * Sanitizes a diff or code snippet before it leaves the server.
 *
 * Steps, in order:
 *   1. Replace every SECRET_PATTERNS match with "[SECRET_REDACTED]".
 *   2. Strip `#`, `//` and single-line block comments, line by line.
 *   3. Replace identifier-like words (3+ chars, not RESERVED) with fn_NNNN tokens.
 *
 * The number of lines is preserved, so line numbers in the sanitized text match
 * the input. Block comments that span several lines are not removed; their words
 * are tokenized like any other text.
 *
 * @param diff Raw diff or snippet text.
 * @returns The sanitized text, the token -> real-name map used to restore names,
 *          summary counters, and an audit record of what was changed.
 */
export function sanitize(diff: string) {
  const tokenMap = new Map<string, string>();      // real -> token
  const reverseMap = new Map<string, string>();    // token -> real
  const occurrences = new Map<string, number>();   // real -> count
  const redactedComments: string[] = [];
  const secretMatches: { pattern: string; replaced: string }[] = [];
  let counter = 1;
  let commentsStripped = 0;
  let secretsBlocked = 0;

  // Normalize CRLF first: a trailing "\r" stops the `$`-anchored comment patterns from matching.
  let redacted = diff.replace(/\r\n/g, "\n");

  // Strip secrets first
  const PATTERN_LABELS = ["api-key/secret/token assignment", "OpenAI key (sk-…)", "JWT bearer"];
  SECRET_PATTERNS.forEach((re, idx) => {
    redacted = redacted.replace(re, (match) => {
      secretsBlocked++;
      // Only record a safe length-summary, NEVER the secret itself
      secretMatches.push({
        pattern: PATTERN_LABELS[idx] ?? "secret",
        replaced: `[REDACTED ${match.length} chars]`,
      });
      // `\s*` in the assignment pattern can span a line break; put the swallowed
      // newlines back so later line numbers do not shift.
      const swallowedNewlines = match.split("\n").length - 1;
      return "[SECRET_REDACTED]" + "\n".repeat(swallowedNewlines);
    });
  });

  // Strip comments line-wise (#, //, /* */), recording exactly the text that was removed.
  const lines = redacted.split("\n").map((line) => {
    const removedParts: string[] = [];
    const stripped = line
      // `#` counts as a comment at line start, right after a diff +/- marker, or after whitespace.
      .replace(/(^[+-]?|\s)#.*$/, (match: string, lead: string) => {
        removedParts.push(match.slice(lead.length));
        return lead;
      })
      .replace(/\/\/.*$/, (match) => {
        removedParts.push(match);
        return "";
      })
      .replace(/\/\*[\s\S]*?\*\//g, (match) => {
        removedParts.push(match);
        return "";
      });
    if (removedParts.length > 0) {
      commentsStripped++;
      const removed = removedParts.join(" ").trim();
      if (removed && redactedComments.length < 20) redactedComments.push(removed);
    }
    return stripped;
  });

  const tokenize = (name: string): string => {
    if (RESERVED.has(name) || /^\d+$/.test(name) || name.length < 3) return name;
    occurrences.set(name, (occurrences.get(name) ?? 0) + 1);
    let tok = tokenMap.get(name);
    if (!tok) {
      tok = `fn_${String(counter++).padStart(4, "0")}`;
      tokenMap.set(name, tok);
      reverseMap.set(tok, name);
    }
    return tok;
  };

  // Tokenize identifiers (simple heuristic: snake_case / camelCase words).
  // Diff header lines are tokenized too, which also hides file paths.
  const sanitizedLines = lines.map((line) =>
    line.replace(/[A-Za-z_][A-Za-z0-9_]{2,}/g, (m) => tokenize(m)),
  );

  const sanitized = sanitizedLines.join("\n");

  // Build audit token map sorted by occurrence (most-used first)
  const auditMap: AuditEntry[] = Array.from(tokenMap.entries())
    .map(([real, token]) => ({ token, real, occurrences: occurrences.get(real) ?? 0 }))
    .sort((a, b) => b.occurrences - a.occurrences);

  // Sample: first 30 lines, original (post-secret-redaction) vs sanitized
  const SAMPLE_LINES = 30;
  const sample: AuditSample = {
    original: redacted.split("\n").slice(0, SAMPLE_LINES).join("\n"),
    sanitized: sanitizedLines.slice(0, SAMPLE_LINES).join("\n"),
  };

  return {
    sanitized,
    reverseMap,
    stats: { identifiersTokenized: tokenMap.size, commentsStripped, secretsBlocked },
    audit: { tokenMap: auditMap, redactedComments, secretMatches, sample },
  };
}

/**
 * Replaces fn_NNNN tokens in AI output with the real names they stand for.
 *
 * Tokens are zero-padded to four digits but grow longer once more than 9999
 * identifiers have been seen, so the whole digit run is matched; matching only
 * four digits would turn "fn_10000" into the name for "fn_1000" plus a stray "0".
 *
 * @param text Text that may contain tokens.
 * @param reverseMap token -> real name, as returned by sanitize().
 * @returns The text with every known token replaced; unknown tokens are left as-is.
 */
export function restore(text: string, reverseMap: Map<string, string>): string {
  return text.replace(/fn_\d{4,}/g, (tok) => reverseMap.get(tok) ?? tok);
}

// ───────────────────────── Diff parser (file path + line numbers) ─────────────────────────
// Parses unified diff so we can report real file paths and the exact added-line range
// for each hunk. The AI returns a hunk index + reasoning; we look up the location here.

/** One hunk of a unified diff, located in new-file line numbers. */
type Hunk = {
  filePath: string;
  hunkIndex: number;      // 0-based position of the hunk within the whole diff
  newStart: number;       // first new-file line of the hunk
  newEnd: number;         // last new-file line that was added/changed
  addedLines: string[];   // "+" lines without the marker
  removedLines: string[]; // "-" lines without the marker
  functionContext: string | null; // text after the closing "@@" of the hunk header, if any
};

/**
 * Parses a unified diff (git or plain `diff -u`) into its hunks.
 *
 * The line counts declared in each `@@` header decide whether a line starting
 * with `+++` / `---` is hunk content (for example an added `++i;` or a removed
 * SQL `-- comment`) or a file header. Lines beyond the declared counts are still
 * accepted, so hand-edited diffs with inaccurate counts keep working.
 *
 * @param diff Unified diff text; LF and CRLF line endings are both accepted.
 * @returns Hunks in input order; empty when no hunk header is found.
 */
function parseDiff(diff: string): Hunk[] {
  const hunks: Hunk[] = [];
  const lines = diff.split(/\r?\n/);
  let currentFile: string | null = null;

  // "--- x" / "+++ y" / "@@ ..." on consecutive lines is the start of the next
  // file's section in a plain unified diff.
  const startsFileHeader = (at: number): boolean =>
    at + 2 < lines.length &&
    lines[at].startsWith("--- ") &&
    lines[at + 1].startsWith("+++ ") &&
    lines[at + 2].startsWith("@@");

  let i = 0;
  while (i < lines.length) {
    const line = lines[i];

    const gitMatch = line.match(/^diff --git a\/(.+?) b\/(.+)$/);
    if (gitMatch) { currentFile = gitMatch[2]; i++; continue; }

    const plusFile = line.match(/^\+\+\+ (?:b\/)?(.+?)(?:\s|$)/);
    if (plusFile && plusFile[1] !== "/dev/null") { currentFile = plusFile[1]; i++; continue; }

    const header = line.match(/^@@ -\d+(?:,(\d+))? \+(\d+)(?:,(\d+))? @@(.*)$/);
    if (!header) { i++; continue; }

    if (!currentFile) currentFile = "unknown";
    const newStart = parseInt(header[2], 10);
    // An omitted count means 1.
    let oldLeft = parseInt(header[1] ?? "1", 10);
    let newLeft = parseInt(header[3] ?? "1", 10);

    const added: string[] = [];
    const removed: string[] = [];
    let cursor = newStart;       // new-file line number of the next "+" or context line
    let lastChanged = newStart;

    let j = i + 1;
    for (; j < lines.length; j++) {
      const l = lines[j];
      if (l.startsWith("@@") || l.startsWith("diff --git") || startsFileHeader(j)) break;
      // "\ No newline at end of file" annotates the previous line; it is not a line itself.
      if (l.startsWith("\\")) continue;

      if (l.startsWith("+")) {
        // Past the declared new-line count, "+++ path" is a file header for the outer loop.
        if (newLeft <= 0 && l.startsWith("+++ ")) break;
        added.push(l.slice(1));
        lastChanged = cursor;
        cursor++;
        newLeft--;
      } else if (l.startsWith("-")) {
        if (oldLeft <= 0 && l.startsWith("--- ")) break;
        removed.push(l.slice(1));
        oldLeft--;
      } else {
        // Context line: present in both the old and the new file.
        cursor++;
        oldLeft--;
        newLeft--;
      }
    }

    hunks.push({
      filePath: currentFile,
      hunkIndex: hunks.length,
      newStart,
      newEnd: lastChanged,
      addedLines: added,
      removedLines: removed,
      functionContext: header[4].trim() || null,
    });

    // Resume at the line that ended the hunk so body lines are never re-read as headers.
    i = j;
  }
  return hunks;
}

// ───────────────────────── AI tool schema ─────────────────────────
// Function-calling schema the model must use to answer in diff mode.
// Every suspect refers to a hunk by its 0-based index in the HUNKS list of the
// user message; analyzeDiff maps that index back to the real file and lines.
const analysisTool = {
  type: "function" as const,
  function: {
    name: "submit_root_cause_analysis",
    description: "Submit ranked root-cause suspects derived from a sanitized git diff and a failure description.",
    parameters: {
      type: "object",
      properties: {
        summary: { type: "string", description: "1-2 sentence overall verdict." },
        suspects: {
          type: "array",
          minItems: 1,
          items: {
            type: "object",
            properties: {
              hunkIndex: { type: "number", description: "Index into the provided HUNKS list (0-based)." },
              functionToken: { type: "string", description: "Anonymized function token (e.g. fn_0019), or empty string." },
              confidence: { type: "string", enum: ["high", "medium", "low"] },
              mechanism: { type: "string", description: "Plain-English explanation of why this change causes the failure." },
              // The arrow was stored as mis-decoded bytes; restored to the intended character.
              changeSummary: { type: "string", description: "Short label of the change (e.g. 'Threshold change 15→16')." },
            },
            required: ["hunkIndex", "functionToken", "confidence", "mechanism", "changeSummary"],
            additionalProperties: false,
          },
        },
      },
      required: ["summary", "suspects"],
      additionalProperties: false,
    },
  },
};

// System prompt for diff mode. The dashes in the field list were stored as
// mis-decoded bytes and are restored to em dashes here.
const SYSTEM_PROMPT = `You are BranchDebug Bot, a code-aware root-cause analyzer powered by Qwen3 reasoning. Inputs are:
1. A SANITIZED unified git diff where real identifiers have been replaced with opaque tokens like fn_0019.
2. A natural-language description of an observed failure.
3. A numbered list of HUNKS (filePath, lineRange, function context).

For each hunk that plausibly caused the failure, return a suspect entry with:
- hunkIndex (the number from the HUNKS list)
- confidence (high/medium/low) — only mark "high" when the mechanism directly explains the failure
- mechanism — concrete cause-and-effect
- changeSummary — a short label

Rank by likelihood. Be conservative; if a hunk is unrelated, do not include it. Always call submit_root_cause_analysis.`;

/**
 * Diff mode: sanitizes a unified diff, asks the AI which hunks explain the
 * failure, and maps the answer back to real file paths, line ranges and names.
 *
 * The failure description is expressed in the diff's token vocabulary: a word
 * that names an identifier in the diff becomes that identifier's token, other
 * identifier-looking words get fresh tokens (registered for restoration), and
 * plain prose words stay readable so the model can understand the report.
 *
 * @param diff Unified diff text.
 * @param failureDescription Free-text description of the observed failure.
 * @param userId Optional user id forwarded to the AI gateway helper.
 * @returns Suspects ordered high -> medium -> low confidence (AI order kept within a level).
 * @throws Error when the diff has no hunks, the gateway responds with an error
 *         status, or the AI reply is not a usable structured analysis.
 */
export async function analyzeDiff(diff: string, failureDescription: string, userId?: string | null): Promise<DebugResult> {
  getAIConfig();

  const { sanitized, reverseMap, stats, audit } = sanitize(diff);
  const hunks = parseDiff(diff);
  if (hunks.length === 0) throw new Error("No hunks found in diff. Make sure the input is a unified git diff.");

  // Sanitizing the description on its own numbers its tokens from fn_0001 again,
  // so they would collide with the diff's tokens while naming different things.
  // Translate every description token into the diff's vocabulary instead.
  const tokenByName = new Map<string, string>();
  reverseMap.forEach((real, token) => tokenByName.set(real, token));
  const described = sanitize(failureDescription);
  let nextTokenNumber = reverseMap.size + 1;
  const safeDescription = described.sanitized.replace(/fn_\d{4,}/g, (localToken) => {
    const real = described.reverseMap.get(localToken);
    if (real === undefined) return localToken;
    const shared = tokenByName.get(real);
    if (shared !== undefined) return shared;
    // Plain prose (letters only, at most a leading capital) stays readable.
    if (real === "SECRET_REDACTED" || /^[A-Za-z][a-z]*$/.test(real)) return real;
    // Identifier-looking word that the diff never mentions: hide it behind a fresh
    // token and register it so restore() can translate it back.
    let fresh = `fn_${String(nextTokenNumber++).padStart(4, "0")}`;
    while (reverseMap.has(fresh)) fresh = `fn_${String(nextTokenNumber++).padStart(4, "0")}`;
    reverseMap.set(fresh, real);
    tokenByName.set(real, fresh);
    return fresh;
  });

  // The model sees sanitized hunk content but real line ranges; hunk i of the
  // sanitized diff corresponds to hunk i of the original.
  const sanitizedHunks = parseDiff(sanitized);
  const hunkList = sanitizedHunks.map((h, i) => {
    const realLoc = hunks[i];
    const range = realLoc ? `lines ${realLoc.newStart}-${realLoc.newEnd}` : `lines ?`;
    const ctx = h.functionContext ? ` in ${h.functionContext}` : "";
    const added = h.addedLines.slice(0, 8).map((l) => `+ ${l}`).join("\n");
    const removed = h.removedLines.slice(0, 8).map((l) => `- ${l}`).join("\n");
    return `[${i}] ${h.filePath} (${range})${ctx}\n${removed}\n${added}`;
  }).join("\n\n");

  const userContent = `FAILURE DESCRIPTION (sanitized):\n${safeDescription}\n\nHUNKS:\n${hunkList}\n\nFULL SANITIZED DIFF:\n${sanitized.slice(0, 40_000)}`;

  const resp = await fetchAIWithFallback(JSON.stringify({
    messages: [
      { role: "system", content: SYSTEM_PROMPT },
      { role: "user", content: userContent },
    ],
    tools: [analysisTool],
    tool_choice: { type: "function", function: { name: "submit_root_cause_analysis" } },
  }), "google/gemini-2.5-pro", "debugBranch", userId);

  if (!resp.ok) {
    const text = await resp.text();
    if (resp.status === 429) throw new Error("Rate limit reached. Try again shortly.");
    if (resp.status === 402) throw new Error("AI credits exhausted. Add credits in Workspace > Usage.");
    throw new Error(`AI gateway error ${resp.status}: ${text.slice(0, 300)}`);
  }

  const json = await resp.json();
  const toolCall = json.choices?.[0]?.message?.tool_calls?.[0];
  if (!toolCall?.function?.arguments) throw new Error("AI did not return structured analysis.");

  // The arguments are model output: check the shape instead of trusting a cast.
  let payload: unknown;
  try {
    payload = JSON.parse(toolCall.function.arguments);
  } catch {
    throw new Error("AI did not return structured analysis.");
  }
  if (typeof payload !== "object" || payload === null) throw new Error("AI did not return structured analysis.");
  const { summary, suspects: rawSuspects } = payload as { summary?: unknown; suspects?: unknown };
  if (typeof summary !== "string" || !Array.isArray(rawSuspects)) throw new Error("AI did not return structured analysis.");

  const isConfidence = (v: unknown): v is Suspect["confidence"] => v === "high" || v === "medium" || v === "low";
  const rank = { high: 0, medium: 1, low: 2 } as const;

  const suspects: Suspect[] = [];
  for (const entry of rawSuspects as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;
    const s = entry as Record<string, unknown>;
    // Entries pointing at a hunk that does not exist, or missing required fields, are dropped.
    const h = typeof s.hunkIndex === "number" ? hunks[s.hunkIndex] : undefined;
    const confidence = s.confidence;
    if (!h || !isConfidence(confidence)) continue;
    if (typeof s.mechanism !== "string" || typeof s.changeSummary !== "string") continue;
    const functionToken = typeof s.functionToken === "string" ? s.functionToken : "";
    suspects.push({
      filePath: h.filePath,
      functionName: functionToken ? restore(functionToken, reverseMap) : (h.functionContext ?? null),
      lineStart: h.newStart,
      lineEnd: h.newEnd,
      confidence,
      mechanism: restore(s.mechanism, reverseMap),
      changeSummary: restore(s.changeSummary, reverseMap),
      beforeSnippet: h.removedLines.slice(0, 6).join("\n") || null,
      afterSnippet: h.addedLines.slice(0, 6).join("\n") || null,
    });
  }
  suspects.sort((a, b) => rank[a.confidence] - rank[b.confidence]);

  return {
    summary: restore(summary, reverseMap),
    suspects,
    sanitizationStats: stats,
    audit,
  };
}

// POST server function for diff mode: validates the payload against InputSchema,
// then delegates to analyzeDiff (no user id is passed).
export const debugBranch = createServerFn({ method: "POST" })
  .inputValidator((raw: unknown) => {
    return InputSchema.parse(raw);
  })
  .handler(async ({ data }): Promise<DebugResult> => {
    const { diff, failureDescription } = data;
    return analyzeDiff(diff, failureDescription);
  });

// ───────────────────────── Snippet mode (no diff) ─────────────────────────
// For users pasting a raw code snippet instead of a unified diff.
// We still run the IP Shield sanitizer, then ask the AI to locate bugs by line.

// Request payload for snippet mode (validated by the debugSnippet server function).
// snippet and failureDescription must be non-empty, capped at 200,000 and 5,000
// characters; language is an optional hint of at most 40 characters.
const SnippetInputSchema = z.object({
  snippet: z.string().min(1).max(200_000),
  failureDescription: z.string().min(1).max(5_000),
  language: z.string().max(40).optional(),
});

// Function-calling schema the model must use to answer in snippet mode.
// Unlike the diff-mode tool, suspects are located by a 1-based line number in
// the numbered snippet and carry the suspect code fragment itself.
const snippetTool = {
  type: "function" as const,
  function: {
    name: "submit_snippet_analysis",
    description: "Submit ranked bug suspects for a raw code snippet (no diff).",
    parameters: {
      type: "object",
      properties: {
        summary: { type: "string" },
        suspects: {
          type: "array",
          minItems: 1,
          items: {
            type: "object",
            properties: {
              line: { type: "number", description: "1-based line number in the snippet." },
              functionToken: { type: "string", description: "Anonymized function/block token, or empty string." },
              confidence: { type: "string", enum: ["high", "medium", "low"] },
              mechanism: { type: "string" },
              changeSummary: { type: "string", description: "Short label of the suspicious pattern." },
              // Used by analyzeSnippet only when the reported line is outside the snippet.
              codeFragment: { type: "string", description: "The exact suspect line(s), anonymized." },
            },
            required: ["line", "functionToken", "confidence", "mechanism", "changeSummary", "codeFragment"],
            additionalProperties: false,
          },
        },
      },
      required: ["summary", "suspects"],
      additionalProperties: false,
    },
  },
};

// System prompt for snippet mode. The dashes and list bullets were stored as
// mis-decoded bytes and are restored to an em dash and "•" here.
const SNIPPET_SYSTEM = `You are BranchDebug Bot in SNIPPET mode, powered by Qwen3 reasoning — an expert code reviewer with deep knowledge of every mainstream programming language (Python, TypeScript/JavaScript, C/C++, C#, Java, Kotlin, Swift, Go, Rust, Ruby, PHP, Scala, Elixir, Haskell, Lua, R, Dart, SQL, Bash, HTML/CSS, YAML/JSON/TOML, and more).

The user pasted a raw code snippet (not a diff). Identifiers are tokenized as fn_NNNN; treat them as opaque names. Carefully analyze the snippet and find ANY of the following classes of bugs that match the failure description (or are obvious defects, even if not described):

  • Syntax errors (missing colons, brackets, semicolons, quotes, indentation)
  • Type errors / wrong argument count / missing or extra parameters
  • Off-by-one errors, bad thresholds, wrong comparison operators
  • Null / undefined / None / nil dereferences
  • Uninitialized variables, scope/closure mistakes, shadowing
  • Logic errors, wrong control flow, unreachable code, infinite loops
  • Race conditions, async/await misuse, unhandled promise rejections
  • Resource leaks (unclosed files, connections, listeners)
  • Security issues (SQL injection, XSS, path traversal, weak crypto, secrets)
  • Performance pitfalls (N+1 queries, quadratic loops on large input)
  • API misuse, deprecated calls, framework-specific anti-patterns
  • Incorrect return values, missing return statements

For EACH defect you find, return one suspect with the 1-based line number from the snippet, a confidence rating, and a clear cause-and-effect mechanism. Be thorough but precise — return multiple suspects when there are multiple bugs (e.g. a syntax error AND a wrong argument count). Only mark "high" when the mechanism directly explains the failure or is an obvious defect. Always call submit_snippet_analysis.`;

/**
 * Snippet mode: sanitizes a raw code snippet, asks the AI to locate defects by
 * line number, and maps the answer back to real names and original source lines.
 *
 * The failure description is expressed in the snippet's token vocabulary: a word
 * that names an identifier in the snippet becomes that identifier's token, other
 * identifier-looking words get fresh tokens (registered for restoration), and
 * plain prose words stay readable so the model can understand the report.
 *
 * @param snippet Raw source code.
 * @param failureDescription Free-text description of the observed failure.
 * @param language Optional language hint; also used as the suffix of the reported file path.
 * @param userId Optional user id forwarded to the AI gateway helper.
 * @returns Suspects ordered high -> medium -> low confidence (AI order kept within a level).
 * @throws Error when the gateway responds with an error status or the AI reply
 *         is not a usable structured analysis.
 */
export async function analyzeSnippet(
  snippet: string,
  failureDescription: string,
  language?: string,
  userId?: string | null,
): Promise<DebugResult> {
  getAIConfig();

  const { sanitized, reverseMap, stats, audit } = sanitize(snippet);
  const numbered = sanitized.split("\n").map((l, i) => `${String(i + 1).padStart(4, " ")} | ${l}`).join("\n");

  // Sanitizing the description on its own numbers its tokens from fn_0001 again,
  // so they would collide with the snippet's tokens while naming different things.
  // Translate every description token into the snippet's vocabulary instead.
  const tokenByName = new Map<string, string>();
  reverseMap.forEach((real, token) => tokenByName.set(real, token));
  const described = sanitize(failureDescription);
  let nextTokenNumber = reverseMap.size + 1;
  const safeDescription = described.sanitized.replace(/fn_\d{4,}/g, (localToken) => {
    const real = described.reverseMap.get(localToken);
    if (real === undefined) return localToken;
    const shared = tokenByName.get(real);
    if (shared !== undefined) return shared;
    // Plain prose (letters only, at most a leading capital) stays readable.
    if (real === "SECRET_REDACTED" || /^[A-Za-z][a-z]*$/.test(real)) return real;
    // Identifier-looking word that the snippet never mentions: hide it behind a
    // fresh token and register it so restore() can translate it back.
    let fresh = `fn_${String(nextTokenNumber++).padStart(4, "0")}`;
    while (reverseMap.has(fresh)) fresh = `fn_${String(nextTokenNumber++).padStart(4, "0")}`;
    reverseMap.set(fresh, real);
    tokenByName.set(real, fresh);
    return fresh;
  });

  const userContent = `LANGUAGE: ${language || "auto-detect"}\n\nFAILURE DESCRIPTION (sanitized):\n${safeDescription}\n\nCODE SNIPPET (line-numbered, sanitized):\n${numbered.slice(0, 40_000)}`;

  const resp = await fetchAIWithFallback(JSON.stringify({
    messages: [
      { role: "system", content: SNIPPET_SYSTEM },
      { role: "user", content: userContent },
    ],
    tools: [snippetTool],
    tool_choice: { type: "function", function: { name: "submit_snippet_analysis" } },
  }), "google/gemini-2.5-pro", "debugSnippet", userId);

  if (!resp.ok) {
    const text = await resp.text();
    if (resp.status === 429) throw new Error("Rate limit reached. Try again shortly.");
    if (resp.status === 402) throw new Error("AI credits exhausted. Add credits in Workspace > Usage.");
    throw new Error(`AI gateway error ${resp.status}: ${text.slice(0, 300)}`);
  }

  const json = await resp.json();
  const toolCall = json.choices?.[0]?.message?.tool_calls?.[0];
  if (!toolCall?.function?.arguments) throw new Error("AI did not return structured analysis.");

  // The arguments are model output: check the shape instead of trusting a cast.
  let payload: unknown;
  try {
    payload = JSON.parse(toolCall.function.arguments);
  } catch {
    throw new Error("AI did not return structured analysis.");
  }
  if (typeof payload !== "object" || payload === null) throw new Error("AI did not return structured analysis.");
  const { summary, suspects: rawSuspects } = payload as { summary?: unknown; suspects?: unknown };
  if (typeof summary !== "string" || !Array.isArray(rawSuspects)) throw new Error("AI did not return structured analysis.");

  const isConfidence = (v: unknown): v is Suspect["confidence"] => v === "high" || v === "medium" || v === "low";
  const rank = { high: 0, medium: 1, low: 2 } as const;
  const snippetLines = snippet.split(/\r?\n/);
  const filePath = language ? `snippet.${language}` : "snippet";

  const suspects: Suspect[] = [];
  for (const entry of rawSuspects as unknown[]) {
    if (typeof entry !== "object" || entry === null) continue;
    const s = entry as Record<string, unknown>;
    const line = s.line;
    const confidence = s.confidence;
    // Entries without a usable 1-based line number or required fields are dropped.
    if (typeof line !== "number" || !Number.isInteger(line) || line < 1) continue;
    if (!isConfidence(confidence)) continue;
    if (typeof s.mechanism !== "string" || typeof s.changeSummary !== "string") continue;
    const functionToken = typeof s.functionToken === "string" ? s.functionToken : "";
    const codeFragment = typeof s.codeFragment === "string" ? s.codeFragment : "";
    suspects.push({
      filePath,
      functionName: functionToken ? restore(functionToken, reverseMap) : null,
      lineStart: line,
      lineEnd: line,
      confidence,
      mechanism: restore(s.mechanism, reverseMap),
      changeSummary: restore(s.changeSummary, reverseMap),
      beforeSnippet: null,
      // Prefer the untouched source line; fall back to the AI's fragment past the end of the snippet.
      afterSnippet: snippetLines[line - 1] ?? restore(codeFragment, reverseMap),
    });
  }
  suspects.sort((a, b) => rank[a.confidence] - rank[b.confidence]);

  return {
    summary: restore(summary, reverseMap),
    suspects,
    sanitizationStats: stats,
    audit,
  };
}

// POST server function for snippet mode: validates the payload against
// SnippetInputSchema, then delegates to analyzeSnippet (no user id is passed).
export const debugSnippet = createServerFn({ method: "POST" })
  .inputValidator((raw: unknown) => {
    return SnippetInputSchema.parse(raw);
  })
  .handler(async ({ data }): Promise<DebugResult> => {
    const { snippet, failureDescription, language } = data;
    return analyzeSnippet(snippet, failureDescription, language);
  });