ruvector-fixed / dist /core /diff-embeddings.js
Archie
Fix dimension/dimensions bug and positional insert/search args
40d7073
"use strict";
/**
* Diff Embeddings - Semantic encoding of git diffs
*
* Generates embeddings for code changes to enable:
* - Change classification (feature, bugfix, refactor)
* - Similar change detection
* - Risk assessment
* - Review prioritization
*/
// Flag the exports object with `__esModule` and register the named exports.
// Every name assigned below is a hoisted `function` declaration defined later
// in this file, so referencing it here (before its definition) is valid.
Object.defineProperty(exports, "__esModule", { value: true });
exports.parseDiff = parseDiff;
exports.classifyChange = classifyChange;
exports.calculateRiskScore = calculateRiskScore;
exports.analyzeFileDiff = analyzeFileDiff;
exports.getCommitDiff = getCommitDiff;
exports.getStagedDiff = getStagedDiff;
exports.getUnstagedDiff = getUnstagedDiff;
exports.analyzeCommit = analyzeCommit;
exports.findSimilarCommits = findSimilarCommits;
const child_process_1 = require("child_process");
const onnx_embedder_1 = require("./onnx-embedder");
/**
 * Parse a unified diff into hunks.
 *
 * Every `@@ -a,b +c,d @@` header opens a hunk that is attributed to the file
 * named by the most recent `diff --git a/<old> b/<new>` line (the `<new>`
 * path, or '' when no such line has been seen yet).
 *
 * @param {string} diff - Unified diff text.
 * @returns {Array<{file: string, oldStart: number, oldLines: number,
 *   newStart: number, newLines: number, content: string,
 *   additions: string[], deletions: string[]}>} Hunks in order of appearance.
 *   `additions` / `deletions` hold the line text without its leading marker;
 *   `content` keeps the markers and ends every line with '\n'.
 */
function parseDiff(diff) {
    const hunks = [];
    let currentFile = '';
    let currentHunk = null;
    // Number of old-side / new-side lines the open hunk's header still promises.
    let oldRemaining = 0;
    let newRemaining = 0;
    const closeHunk = () => {
        if (currentHunk) {
            hunks.push(currentHunk);
            // Reset so an unparseable `@@` header can never cause the same
            // hunk object to be extended and pushed a second time.
            currentHunk = null;
        }
    };
    for (const line of diff.split('\n')) {
        // File header: ends the previous file's last hunk.
        if (line.startsWith('diff --git')) {
            closeHunk();
            const fileMatch = line.match(/diff --git a\/(.+) b\/(.+)/);
            if (fileMatch) {
                currentFile = fileMatch[2];
            }
            continue;
        }
        // Hunk header
        if (line.startsWith('@@')) {
            closeHunk();
            const header = line.match(/@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@/);
            if (header) {
                currentHunk = {
                    file: currentFile,
                    oldStart: parseInt(header[1], 10),
                    // An omitted count means a single line.
                    oldLines: parseInt(header[2] || '1', 10),
                    newStart: parseInt(header[3], 10),
                    newLines: parseInt(header[4] || '1', 10),
                    content: '',
                    additions: [],
                    deletions: [],
                };
                oldRemaining = currentHunk.oldLines;
                newRemaining = currentHunk.newLines;
            }
            continue;
        }
        if (!currentHunk) {
            continue;
        }
        // Content lines. While the header's line budget is not used up, a line
        // starting with '+++' / '---' is real content (e.g. an added `++i` or a
        // deleted `-- sql comment`). Only once the budget is exhausted is such
        // a line treated as the next file's `+++ b/…` / `--- a/…` header.
        if (line.startsWith('+')) {
            if (newRemaining <= 0 && line.startsWith('+++')) {
                continue;
            }
            currentHunk.additions.push(line.substring(1));
            currentHunk.content += line + '\n';
            newRemaining--;
        }
        else if (line.startsWith('-')) {
            if (oldRemaining <= 0 && line.startsWith('---')) {
                continue;
            }
            currentHunk.deletions.push(line.substring(1));
            currentHunk.content += line + '\n';
            oldRemaining--;
        }
        else if (line.startsWith(' ')) {
            // Context lines exist on both sides of the diff.
            currentHunk.content += line + '\n';
            oldRemaining--;
            newRemaining--;
        }
    }
    closeHunk();
    return hunks;
}
/**
 * Classify a change with keyword heuristics.
 *
 * Checks run in this order and the first hit wins:
 *   1. keywords in the commit message,
 *   2. file-path patterns (docs / test / config),
 *   3. keywords in the diff content.
 *
 * @param {string} diff - Unified diff text, or a bare file path.
 * @param {string} [message=''] - Commit message.
 * @returns {string} One of 'bugfix', 'feature', 'refactor', 'docs', 'test',
 *   'config' or 'unknown'.
 */
function classifyChange(diff, message = '') {
    const lowerMessage = message.toLowerCase();
    const lowerDiff = diff.toLowerCase();
    // Check message patterns
    if (/\b(fix|bug|issue|error|crash|patch)\b/.test(lowerMessage))
        return 'bugfix';
    if (/\b(feat|feature|add|new|implement)\b/.test(lowerMessage))
        return 'feature';
    if (/\b(refactor|clean|improve|optimize)\b/.test(lowerMessage))
        return 'refactor';
    if (/\b(doc|readme|comment|jsdoc)\b/.test(lowerMessage))
        return 'docs';
    if (/\b(test|spec|coverage)\b/.test(lowerMessage))
        return 'test';
    if (/\b(config|ci|cd|build|deps)\b/.test(lowerMessage))
        return 'config';
    // Check path patterns. The extension regexes are anchored with `$`, which
    // (without the `m` flag) only matches at the very end of the input, so
    // they can never see a path inside a multi-line diff. Pull the changed
    // paths out of the diff's header lines and test each path on its own.
    const changedPaths = [];
    for (const line of diff.split(/\r?\n/)) {
        const header = line.match(/^(?:diff --git a\/.+ b\/|\+\+\+ b\/|--- a\/)(.+)$/);
        if (header) {
            changedPaths.push(header[1].trim());
        }
    }
    // No headers found: treat the input itself as the path, which preserves
    // the result for callers that pass a bare file name.
    const targets = changedPaths.length > 0 ? changedPaths : [diff];
    // A change only counts as docs/config when every touched file is one.
    if (targets.every((path) => /\.(md|txt|rst)$/.test(path)))
        return 'docs';
    if (/\.(test|spec)\.[jt]sx?/.test(diff))
        return 'test';
    if (targets.every((path) => /\.(json|ya?ml|toml|ini)$/.test(path)))
        return 'config';
    // Check content patterns
    if (/\bcatch\b|\btry\b|\berror\b/.test(lowerDiff) && /\bfix\b/.test(lowerDiff))
        return 'bugfix';
    if (/\bfunction\b|\bclass\b|\bexport\b/.test(lowerDiff))
        return 'feature';
    return 'unknown';
}
/**
 * Calculate a heuristic risk score for a file analysis.
 *
 * The score is the sum of independent contributions, capped at 1:
 *   - size of the change (additions + deletions),
 *   - branching complexity,
 *   - sensitive-looking file paths (auth/security, database/migration,
 *     api/endpoint),
 *   - deleted lines that look like error handling, validation or null guards.
 *
 * @param {{file: string, totalAdditions: number, totalDeletions: number,
 *   complexity: number, hunks: Array<{deletions: string[]}>}} analysis
 * @returns {number} Risk in the range [0, 1].
 */
function calculateRiskScore(analysis) {
    // Tolerate partially-filled analyses instead of throwing on a missing field.
    const file = analysis.file || '';
    const hunks = analysis.hunks || [];
    let risk = 0;
    // Size risk
    const totalChanges = analysis.totalAdditions + analysis.totalDeletions;
    if (totalChanges > 500)
        risk += 0.3;
    else if (totalChanges > 200)
        risk += 0.2;
    else if (totalChanges > 50)
        risk += 0.1;
    // Complexity risk
    if (analysis.complexity > 20)
        risk += 0.2;
    else if (analysis.complexity > 10)
        risk += 0.1;
    // File type risk
    if (file.includes('auth') || file.includes('security'))
        risk += 0.2;
    if (file.includes('database') || file.includes('migration'))
        risk += 0.15;
    if (file.includes('api') || file.includes('endpoint'))
        risk += 0.1;
    // Pattern risk (deletions of error handling, etc.)
    const errorHandling = /\bcatch\b|\berror\b|\bvalidat/;
    // The alternation is grouped so that `if` applies to both `null` and
    // `undefined`; ungrouped, any deleted line that merely mentioned
    // `undefined` was scored as a removed guard.
    const nullGuard = /\bif\b.*\b(?:null|undefined)\b/;
    for (const hunk of hunks) {
        for (const del of hunk.deletions) {
            if (errorHandling.test(del))
                risk += 0.1;
            if (nullGuard.test(del))
                risk += 0.05;
        }
    }
    return Math.min(1, risk);
}
/**
 * Build the analysis record for one file of a diff.
 *
 * Keeps the hunks of `diff` that belong to `file` (plus hunks with no file
 * attribution), counts added/deleted lines, counts added lines containing a
 * branching keyword or `?`, classifies the change and scores its risk. When
 * the ONNX embedder reports ready, an embedding of a short textual summary is
 * attached as `embedding`.
 *
 * @param {string} file - Path of the file to analyze.
 * @param {string} diff - Unified diff text that contains the file's hunks.
 * @param {string} [message=''] - Commit message forwarded to the classifier.
 * @returns {Promise<object>} Analysis with `file`, `hunks`, `totalAdditions`,
 *   `totalDeletions`, `complexity`, `riskScore`, `category` and, optionally,
 *   `embedding`.
 */
async function analyzeFileDiff(file, diff, message = '') {
    const hunks = parseDiff(diff).filter((hunk) => hunk.file === file || hunk.file === '');
    const branchPattern = /\bif\b|\belse\b|\bfor\b|\bwhile\b|\bswitch\b|\bcatch\b|\?/;
    let totalAdditions = 0;
    let totalDeletions = 0;
    let complexity = 0;
    // Single pass over the hunks gathers all three counters.
    for (const { additions, deletions } of hunks) {
        totalAdditions += additions.length;
        totalDeletions += deletions.length;
        complexity += additions.filter((added) => branchPattern.test(added)).length;
    }
    const category = classifyChange(diff, message);
    const analysis = {
        file,
        hunks,
        totalAdditions,
        totalDeletions,
        complexity,
        riskScore: 0,
        category,
    };
    // The score is derived from the record itself, so fill it in afterwards.
    analysis.riskScore = calculateRiskScore(analysis);
    if (!(0, onnx_embedder_1.isReady)()) {
        return analysis;
    }
    // Embed a compact summary: category, file name and the first 500
    // characters of the joined hunk contents.
    const joinedContent = hunks.map((hunk) => hunk.content).join('\n');
    const summary = `${category} change in ${file}: ${joinedContent.substring(0, 500)}`;
    const result = await (0, onnx_embedder_1.embed)(summary);
    analysis.embedding = result.embedding;
    return analysis;
}
/**
 * Get the diff introduced by a commit (`git show` with an empty format, so
 * only the patch is returned).
 *
 * git is invoked directly with an argument array rather than through a shell,
 * so `commitHash` can never be interpreted as shell syntax, and stderr is
 * discarded through `stdio` instead of a POSIX-only `2>/dev/null` redirect.
 *
 * @param {string} [commitHash='HEAD'] - Revision to show.
 * @returns {string} The diff text, or '' when git fails (including output
 *   larger than the 10 MiB buffer) or the revision looks like an option.
 */
function getCommitDiff(commitHash = 'HEAD') {
    const revision = String(commitHash);
    // A revision beginning with '-' would be parsed by git as an option.
    if (revision.startsWith('-')) {
        return '';
    }
    try {
        return (0, child_process_1.execFileSync)('git', ['show', '--format=', revision], {
            encoding: 'utf8',
            maxBuffer: 10 * 1024 * 1024,
            stdio: ['ignore', 'pipe', 'ignore'],
        });
    }
    catch {
        // Best effort: callers treat '' as "no diff available".
        return '';
    }
}
/**
 * Get the diff of staged changes (`git diff --cached`).
 *
 * git is invoked directly, without a shell; stderr is discarded through
 * `stdio` rather than a `2>/dev/null` redirect, which is not valid under
 * Windows' default shell.
 *
 * @returns {string} The diff text, or '' when git fails (including output
 *   larger than the 10 MiB buffer).
 */
function getStagedDiff() {
    try {
        return (0, child_process_1.execFileSync)('git', ['diff', '--cached'], {
            encoding: 'utf8',
            maxBuffer: 10 * 1024 * 1024,
            stdio: ['ignore', 'pipe', 'ignore'],
        });
    }
    catch {
        // Best effort: callers treat '' as "nothing staged / not a repository".
        return '';
    }
}
/**
 * Get the diff of unstaged working-tree changes (`git diff`).
 *
 * git is invoked directly, without a shell; stderr is discarded through
 * `stdio` rather than a `2>/dev/null` redirect, which is not valid under
 * Windows' default shell.
 *
 * @returns {string} The diff text, or '' when git fails (including output
 *   larger than the 10 MiB buffer).
 */
function getUnstagedDiff() {
    try {
        return (0, child_process_1.execFileSync)('git', ['diff'], {
            encoding: 'utf8',
            maxBuffer: 10 * 1024 * 1024,
            stdio: ['ignore', 'pipe', 'ignore'],
        });
    }
    catch {
        // Best effort: callers treat '' as "no changes / not a repository".
        return '';
    }
}
/**
 * Analyze a commit: metadata, per-file analyses, aggregate counts and risk,
 * and (when the ONNX embedder reports ready) an embedding of a short summary.
 *
 * @param {string} [commitHash='HEAD'] - Revision to analyze.
 * @returns {Promise<{hash: string, message: string, author: string,
 *   date: string, files: object[], totalAdditions: number,
 *   totalDeletions: number, riskScore: number, embedding: *}>}
 *   `riskScore` is the mean of the per-file scores (0 when no file changed);
 *   `embedding` is undefined when the embedder is not ready. Metadata fields
 *   are '' when they cannot be read from git.
 */
async function analyzeCommit(commitHash = 'HEAD') {
    const diff = getCommitDiff(commitHash);
    // Get commit metadata
    let message = '';
    let author = '';
    let date = '';
    const revision = String(commitHash);
    // A revision beginning with '-' would be parsed by git as an option.
    if (!revision.startsWith('-')) {
        try {
            // Fields are separated by the ASCII unit separator (0x1f) rather
            // than '|', which can legitimately occur in a commit subject.
            // git runs without a shell, so the revision is never shell-parsed.
            const info = (0, child_process_1.execFileSync)('git', ['log', '-1', '--format=%s%x1f%an%x1f%aI', revision], {
                encoding: 'utf8',
                stdio: ['ignore', 'pipe', 'ignore'],
            }).trim();
            // Defaults keep every field a string even if git printed nothing.
            [message = '', author = '', date = ''] = info.split('\x1f');
        }
        catch {
            // Best effort: the diff can still be analyzed without metadata.
        }
    }
    // Distinct files in order of first appearance (Set preserves insertion order).
    const changedFiles = new Set(parseDiff(diff).map((hunk) => hunk.file));
    // Analyze each file. The whole commit diff is passed on purpose:
    // analyzeFileDiff re-parses it and keeps only the hunks for `file`.
    const files = [];
    for (const file of changedFiles) {
        files.push(await analyzeFileDiff(file, diff, message));
    }
    const totalAdditions = files.reduce((sum, f) => sum + f.totalAdditions, 0);
    const totalDeletions = files.reduce((sum, f) => sum + f.totalDeletions, 0);
    const riskScore = files.length > 0
        ? files.reduce((sum, f) => sum + f.riskScore, 0) / files.length
        : 0;
    // Generate commit embedding
    let embedding;
    if ((0, onnx_embedder_1.isReady)()) {
        const commitText = `${message}\n\nFiles changed: ${files.map(f => f.file).join(', ')}\n+${totalAdditions} -${totalDeletions}`;
        const result = await (0, onnx_embedder_1.embed)(commitText);
        embedding = result.embedding;
    }
    return {
        hash: commitHash,
        message,
        author,
        date,
        files,
        totalAdditions,
        totalDeletions,
        riskScore,
        embedding,
    };
}
/**
 * Find past commits whose embedding is most similar to a diff's embedding.
 *
 * Initializes the ONNX embedder if it is not ready, embeds the first 1000
 * characters of `currentDiff`, analyzes up to `recentCommits` commits from
 * `git log` and ranks them by cosine similarity.
 *
 * @param {string} currentDiff - Diff text to compare against history.
 * @param {number} [recentCommits=50] - How many recent commits to inspect.
 * @param {number} [topK=5] - Maximum number of results to return.
 * @returns {Promise<Array<{hash: string, similarity: number, message: string}>>}
 *   Matches sorted by descending similarity; [] when git fails or
 *   `recentCommits` is not a positive number.
 */
async function findSimilarCommits(currentDiff, recentCommits = 50, topK = 5) {
    if (!(0, onnx_embedder_1.isReady)()) {
        await (0, onnx_embedder_1.initOnnxEmbedder)();
    }
    // Get current diff embedding
    const currentEmbedding = (await (0, onnx_embedder_1.embed)(currentDiff.substring(0, 1000))).embedding;
    // Coerce the count to a whole number so nothing but digits can reach the
    // git command line.
    const limit = Math.floor(Number(recentCommits));
    if (!Number.isFinite(limit) || limit <= 0) {
        return [];
    }
    // Get recent commits
    let commits = [];
    try {
        commits = (0, child_process_1.execFileSync)('git', ['log', `-${limit}`, '--format=%H'], {
            encoding: 'utf8',
            stdio: ['ignore', 'pipe', 'ignore'],
        })
            .trim()
            .split('\n')
            // ''.split('\n') yields [''], so drop blanks rather than
            // analyzing an empty revision.
            .filter(Boolean);
    }
    catch {
        return [];
    }
    // Analyze and compare
    const results = [];
    for (const hash of commits.slice(0, limit)) {
        const analysis = await analyzeCommit(hash);
        if (analysis.embedding) {
            const similarity = cosineSimilarity(currentEmbedding, analysis.embedding);
            results.push({ hash, similarity, message: analysis.message });
        }
    }
    return results
        .sort((a, b) => b.similarity - a.similarity)
        .slice(0, topK);
}
/**
 * Cosine similarity of two equal-length numeric vectors.
 *
 * @param {ArrayLike<number>} a - First vector.
 * @param {ArrayLike<number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|); 0 when the lengths differ or
 *   when the product of the magnitudes is zero.
 */
function cosineSimilarity(a, b) {
    const size = a.length;
    if (size !== b.length) {
        return 0;
    }
    let dot = 0;
    let sumSquaresA = 0;
    let sumSquaresB = 0;
    let index = 0;
    while (index < size) {
        const x = a[index];
        const y = b[index];
        dot += x * y;
        sumSquaresA += x * x;
        sumSquaresB += y * y;
        index += 1;
    }
    const denominator = Math.sqrt(sumSquaresA) * Math.sqrt(sumSquaresB);
    if (denominator === 0) {
        return 0;
    }
    return dot / denominator;
}
// Default export: one object bundling the same public functions that are
// also registered individually as named exports.
exports.default = {
parseDiff,
classifyChange,
calculateRiskScore,
analyzeFileDiff,
analyzeCommit,
getCommitDiff,
getStagedDiff,
getUnstagedDiff,
findSimilarCommits,
};