// Source: widgettdc-api/apps/backend/src/services/ingestors/ErrorDatabaseIngestor.ts
// Revision 34367da ("Update backend source")
/**
 * ╔═══════════════════════════════════════════════════════════════════════════╗
 * ║  ERROR DATABASE INGESTOR                                                 ║
 * ║═══════════════════════════════════════════════════════════════════════════║
 * ║  Fetches error patterns from GitHub, HuggingFace, and other sources       ║
 * ║  Used to train SelfHealer                                                 ║
 * ╚═══════════════════════════════════════════════════════════════════════════╝
 */
import { logger } from '../../utils/logger.js';
import { errorKnowledgeBase, type ErrorPattern, type ErrorSource, type ErrorCategory } from '../ErrorKnowledgeBase.js';
import { withRetry, isRetryableError } from '../../utils/resilience.js';
// Module-scoped child logger created with the `module: 'ErrorDatabaseIngestor'` binding.
const log = logger.child({ module: 'ErrorDatabaseIngestor' });
// ═══════════════════════════════════════════════════════════════════════════
// EXTERNAL SOURCE DEFINITIONS
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Describes one external catalogue of error patterns known to the ingestor.
 */
export interface ExternalSource {
/** Human-readable label; also the key under which per-source ingestion stats are stored. */
name: string;
/** Knowledge-base source identifier handed to `batchIngest` for this source's patterns. */
source: ErrorSource;
/** Selects which fetch routine `ingestSource` dispatches to. */
type: 'github' | 'huggingface' | 'api';
/** Endpoint associated with the source. NOTE(review): nothing in this file reads it — confirm whether other modules do. */
url: string;
/** Short free-text summary of what the source contains. */
description: string;
/** When false, `ingestAll` skips this source. */
enabled: boolean;
}
/**
 * Registry of every external source the ingestor knows about.
 *
 * `ingestAll` walks the enabled entries in array order, and `ingestFromSource`
 * looks an entry up by either its `name` or its `source` identifier.
 */
export const EXTERNAL_SOURCES: ExternalSource[] = [
// GitHub Sources
{
name: 'Defects4J',
source: 'github-defects4j',
type: 'github',
url: 'https://api.github.com/repos/rjust/defects4j/contents/framework/bug-mining',
description: '854 reproducible Java bugs',
enabled: true
},
{
name: 'BugsJS',
source: 'github-bugsjs',
type: 'github',
url: 'https://api.github.com/repos/nicola/BugsJS/contents/dataset',
description: '453 JavaScript/Node.js bugs',
enabled: true
},
{
name: 'GAIA AIOps Dataset',
source: 'gaia-aiops',
type: 'github',
url: 'https://api.github.com/repos/CloudWise-OpenSource/GAIA-DataSet/contents',
description: 'AIOps anomaly detection dataset',
enabled: true
},
// HuggingFace Sources
{
name: 'CVE Training Dataset',
source: 'huggingface-cve',
type: 'huggingface',
url: 'https://huggingface.co/api/datasets/AlicanKiraz0/All-CVE-Records-Training-Dataset',
description: '300K+ CVE records 1999-2025',
enabled: true
},
{
name: 'Defect Detection',
source: 'huggingface-defect-detection',
type: 'huggingface',
url: 'https://huggingface.co/api/datasets/mcanoglu/defect-detection',
description: 'Safe/vulnerable code pairs',
enabled: true
},
{
name: 'HDFS Log Dataset',
source: 'huggingface-hdfs-logs',
type: 'huggingface',
url: 'https://huggingface.co/api/datasets/logfit-project/HDFS_v1',
description: 'HDFS system logs for anomaly detection',
enabled: true
},
// Microsoft Sources
// (type 'api': the class's fetchApiPatterns returns no patterns for these entries;
// the Microsoft patterns are ingested by ingestMicrosoftOfficeErrors instead.)
{
name: 'Office API Errors',
source: 'microsoft-office-api',
type: 'api',
url: 'https://learn.microsoft.com/api/apibrowser/dotnet/namespaces',
description: 'Office Add-ins error codes',
enabled: true
},
{
name: 'Microsoft Graph Errors',
source: 'microsoft-graph-api',
type: 'api',
url: 'https://graph.microsoft.com/v1.0/$metadata',
description: 'Graph API error codes',
enabled: true
}
];
// ═══════════════════════════════════════════════════════════════════════════
// INGESTOR CLASS
// ═══════════════════════════════════════════════════════════════════════════
/**
 * `ErrorPattern` without the `id`, `occurrences`, `lastSeen` and `createdAt`
 * fields — the shape this module passes to `errorKnowledgeBase.batchIngest`.
 */
type IngestablePattern = Omit<ErrorPattern, 'id' | 'occurrences' | 'lastSeen' | 'createdAt'>;

/**
 * Feeds error patterns into `errorKnowledgeBase`, one batch per source, and
 * keeps per-source ingestion statistics in memory.
 *
 * All pattern data is currently hard-coded in the `generate*` methods below;
 * the fetch methods do not perform any network requests.
 */
export class ErrorDatabaseIngestor {
  private static instance: ErrorDatabaseIngestor;

  /** Per-source bookkeeping, keyed by `ExternalSource.name`. */
  private readonly ingestionStats: Map<string, { lastRun: Date; count: number; errors: number }> = new Map();

  /**
   * Returns the shared instance, creating it on first use.
   */
  public static getInstance(): ErrorDatabaseIngestor {
    if (!ErrorDatabaseIngestor.instance) {
      ErrorDatabaseIngestor.instance = new ErrorDatabaseIngestor();
    }
    return ErrorDatabaseIngestor.instance;
  }

  /**
   * Ingest from all enabled sources, then ingest the built-in Microsoft
   * error codes.
   *
   * A failing step is logged and counted in `failed`; it never aborts the
   * remaining steps and never rejects the returned promise.
   *
   * @returns `success`/`failed` step counts and the total number of new patterns.
   */
  public async ingestAll(): Promise<{ success: number; failed: number; patterns: number }> {
    log.info('Starting full error database ingestion...');
    let success = 0;
    let failed = 0;
    let totalPatterns = 0;

    for (const source of EXTERNAL_SOURCES.filter(s => s.enabled)) {
      try {
        const result = await this.ingestSource(source);
        totalPatterns += result.new;
        success++;
        log.info(`✓ ${source.name}: ${result.new} new patterns (${result.duplicates} dupes)`);
      } catch (error) {
        failed++;
        log.error(`✗ ${source.name} failed:`, error);
      }
    }

    // Also ingest built-in Microsoft Office / Graph error codes. Guarded like
    // the per-source steps so a failure here cannot discard the tallies above.
    try {
      totalPatterns += await this.ingestMicrosoftOfficeErrors();
    } catch (error) {
      failed++;
      log.error('✗ Built-in Microsoft error codes failed:', error);
    }

    log.info(`Ingestion complete: ${success} sources, ${totalPatterns} new patterns`);
    return { success, failed, patterns: totalPatterns };
  }

  /**
   * Ingest from a single source and record the outcome in the stats map.
   *
   * On success the source's stats are replaced (`errors` is reset to 0).
   * On failure the source's `errors` counter is incremented — creating the
   * stats entry if this source has never been recorded — and the error is
   * rethrown.
   *
   * @param source The source to ingest; its `type` selects the fetch routine.
   * @returns The result reported by `errorKnowledgeBase.batchIngest`.
   * @throws Whatever the fetch routine or `batchIngest` throws.
   */
  public async ingestSource(source: ExternalSource): Promise<{ total: number; new: number; duplicates: number }> {
    try {
      let patterns: IngestablePattern[] = [];
      switch (source.type) {
        case 'github':
          patterns = await this.fetchGitHubPatterns(source);
          break;
        case 'huggingface':
          patterns = await this.fetchHuggingFacePatterns(source);
          break;
        case 'api':
          patterns = await this.fetchApiPatterns(source);
          break;
      }

      const result = await errorKnowledgeBase.batchIngest(patterns, source.source);

      // Update stats
      this.ingestionStats.set(source.name, {
        lastRun: new Date(),
        count: result.new,
        errors: 0
      });
      return result;
    } catch (error) {
      // A source that has never succeeded has no entry yet; create one so the
      // failure is still visible through getStats()/listSources().
      const stats = this.ingestionStats.get(source.name) ?? { lastRun: new Date(), count: 0, errors: 0 };
      stats.errors++;
      this.ingestionStats.set(source.name, stats);
      throw error;
    }
  }

  /**
   * Fetch patterns for a GitHub-backed source.
   *
   * Simulated: returns the hard-coded pattern set matching `source.source`
   * (in production this would make actual API calls). Unknown identifiers
   * yield an empty list.
   */
  private async fetchGitHubPatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    switch (source.source) {
      case 'github-defects4j':
        // Java bug patterns from Defects4J
        return this.generateDefects4JPatterns();
      case 'github-bugsjs':
        // JavaScript bug patterns from BugsJS
        return this.generateBugsJSPatterns();
      case 'gaia-aiops':
        // AIOps patterns
        return this.generateGAIAPatterns();
      default:
        return [];
    }
  }

  /**
   * Fetch patterns for a HuggingFace-backed source.
   *
   * Returns the hard-coded pattern set matching `source.source`; unknown
   * identifiers yield an empty list.
   */
  private async fetchHuggingFacePatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    switch (source.source) {
      case 'huggingface-cve':
        return this.generateCVEPatterns();
      case 'huggingface-defect-detection':
        return this.generateDefectDetectionPatterns();
      case 'huggingface-hdfs-logs':
        return this.generateHDFSLogPatterns();
      default:
        return [];
    }
  }

  /**
   * Fetch patterns for an API-backed source.
   *
   * Always empty: the Microsoft patterns are ingested by
   * `ingestMicrosoftOfficeErrors` instead, so `source` is intentionally unused.
   */
  private async fetchApiPatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    return []; // Will be populated by ingestMicrosoftOfficeErrors
  }

  /**
   * Ingest the built-in Microsoft Office and Microsoft Graph error codes.
   *
   * Each group is submitted under its own source identifier so Graph patterns
   * are not attributed to `'microsoft-office-api'`.
   *
   * @returns The combined number of new patterns across both batches.
   */
  private async ingestMicrosoftOfficeErrors(): Promise<number> {
    const office = await errorKnowledgeBase.batchIngest(this.generateOfficeApiPatterns(), 'microsoft-office-api');
    const graph = await errorKnowledgeBase.batchIngest(this.generateGraphApiPatterns(), 'microsoft-graph-api');
    return office.new + graph.new;
  }

  // ═══════════════════════════════════════════════════════════════════════════
  // PATTERN GENERATORS (Simulated data from research papers)
  // ═══════════════════════════════════════════════════════════════════════════

  /** Built-in Office.js / Excel error patterns (source `microsoft-office-api`). */
  private generateOfficeApiPatterns(): IngestablePattern[] {
    return [
      // Office Common API errors
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: InvalidArgument',
        description: 'An invalid argument was passed to the function',
        severity: 'medium',
        solutions: [
          { description: 'Check that all required parameters are provided', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Validate parameter types match expected types', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'api', 'argument']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: GeneralException',
        description: 'General error during Office operation',
        severity: 'high',
        solutions: [
          { description: 'Check Office application logs for details', confidence: 0.7, source: 'microsoft-docs', verified: true },
          { description: 'Ensure document is not corrupted', confidence: 0.6, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'general']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: ItemNotFound',
        description: 'The requested item does not exist',
        severity: 'medium',
        solutions: [
          { description: 'Check if item exists before accessing', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Use getItemOrNullObject() method', confidence: 0.95, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'excel', 'word', 'item']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: AccessDenied',
        description: 'Access to resource is denied',
        severity: 'high',
        solutions: [
          { description: 'Check add-in permissions in manifest', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Request appropriate API permissions', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'permission', 'security']
      },
      // Excel-specific errors
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'Excel.Error: InvalidBinding',
        description: 'Excel binding is no longer valid',
        severity: 'medium',
        solutions: [
          { description: 'Re-create the binding when document reopens', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Store binding references in Office settings', confidence: 0.8, source: 'microsoft-docs', verified: true }
        ],
        tags: ['excel', 'binding']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'RichApi.Error: The operation is invalid for the object',
        description: 'Operation not valid for current object state',
        severity: 'medium',
        solutions: [
          { description: 'Sync context before accessing properties', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Use context.sync() before reading values', confidence: 0.95, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'async', 'sync']
      }
    ];
  }

  /** Built-in Microsoft Graph error patterns (source `microsoft-graph-api`). */
  private generateGraphApiPatterns(): IngestablePattern[] {
    return [
      // Graph API errors
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: BadRequest',
        description: 'Invalid request syntax or parameters',
        severity: 'medium',
        solutions: [
          { description: 'Validate request body JSON format', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check required fields are present', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'api', 'request']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: Unauthorized',
        description: 'Authentication token missing or invalid',
        severity: 'high',
        solutions: [
          { description: 'Refresh access token', confidence: 0.95, source: 'microsoft-docs', verified: true },
          { description: 'Check token has required scopes', confidence: 0.9, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'auth', 'token']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: Forbidden',
        description: 'Insufficient permissions for operation',
        severity: 'high',
        solutions: [
          { description: 'Request admin consent for required permissions', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check Azure AD app registration permissions', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'permission', 'consent']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: NotFound',
        description: 'Resource does not exist',
        severity: 'medium',
        solutions: [
          { description: 'Verify resource ID is correct', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check if resource was deleted', confidence: 0.7, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'resource']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: ServiceUnavailable',
        description: 'Microsoft Graph service temporarily unavailable',
        severity: 'high',
        solutions: [
          { description: 'Implement retry with exponential backoff', confidence: 0.95, source: 'microsoft-docs', verified: true },
          { description: 'Check Microsoft 365 service health dashboard', confidence: 0.7, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'availability', 'retry']
      }
    ];
  }

  /** Java runtime/concurrency patterns (source `github-defects4j`). */
  private generateDefects4JPatterns(): IngestablePattern[] {
    return [
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.NullPointerException',
        description: 'Null pointer dereference in Java',
        severity: 'high',
        solutions: [
          { description: 'Add null check before method call', confidence: 0.9, source: 'defects4j', verified: true },
          { description: 'Use Optional<T> for nullable values', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'null', 'npe'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.ArrayIndexOutOfBoundsException',
        description: 'Array index out of bounds',
        severity: 'high',
        solutions: [
          { description: 'Validate array index before access', confidence: 0.9, source: 'defects4j', verified: true },
          { description: 'Use enhanced for-loop instead of index', confidence: 0.8, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'array', 'bounds'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.ClassCastException',
        description: 'Invalid type cast',
        severity: 'medium',
        solutions: [
          { description: 'Use instanceof check before casting', confidence: 0.95, source: 'defects4j', verified: true },
          { description: 'Use generics to avoid raw types', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'cast', 'type'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'concurrency',
        signature: 'java.util.ConcurrentModificationException',
        description: 'Collection modified during iteration',
        severity: 'high',
        solutions: [
          { description: 'Use Iterator.remove() instead of Collection.remove()', confidence: 0.95, source: 'defects4j', verified: true },
          { description: 'Use ConcurrentHashMap for concurrent access', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'concurrent', 'collection'],
        language: 'java'
      }
    ];
  }

  /** JavaScript/Node.js runtime patterns (source `github-bugsjs`). */
  private generateBugsJSPatterns(): IngestablePattern[] {
    return [
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'TypeError: callback is not a function',
        description: 'Callback parameter is not a function',
        severity: 'high',
        solutions: [
          { description: 'Check callback existence: if (typeof callback === "function")', confidence: 0.95, source: 'bugsjs', verified: true },
          { description: 'Provide default callback: callback = callback || (() => {})', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'callback', 'async'],
        language: 'javascript',
        framework: 'node'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'ReferenceError: variable is not defined',
        description: 'Undefined variable reference',
        severity: 'high',
        solutions: [
          { description: 'Declare variable before use', confidence: 0.9, source: 'bugsjs', verified: true },
          { description: 'Use strict mode to catch undeclared variables', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'variable', 'scope'],
        language: 'javascript'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'UnhandledPromiseRejectionWarning',
        description: 'Promise rejection not handled',
        severity: 'high',
        solutions: [
          { description: 'Add .catch() handler to promise chain', confidence: 0.95, source: 'bugsjs', verified: true },
          { description: 'Use try-catch with async/await', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'promise', 'async'],
        language: 'javascript',
        framework: 'node'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'Error: ENOENT: no such file or directory',
        description: 'File or directory not found',
        severity: 'medium',
        solutions: [
          { description: 'Check file exists with fs.existsSync() before access', confidence: 0.9, source: 'bugsjs', verified: true },
          { description: 'Create directory with fs.mkdirSync({ recursive: true })', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['node', 'fs', 'file'],
        language: 'javascript',
        framework: 'node'
      }
    ];
  }

  /** AIOps performance/network anomaly patterns (source `gaia-aiops`). */
  private generateGAIAPatterns(): IngestablePattern[] {
    return [
      {
        source: 'gaia-aiops',
        category: 'performance',
        signature: 'CPU utilization spike detected',
        description: 'Abnormal CPU usage pattern',
        severity: 'medium',
        solutions: [
          { description: 'Profile application to identify CPU-intensive operations', confidence: 0.8, source: 'gaia', verified: true },
          { description: 'Consider horizontal scaling', confidence: 0.7, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'cpu', 'performance']
      },
      {
        source: 'gaia-aiops',
        category: 'performance',
        signature: 'Memory leak pattern detected',
        description: 'Gradual memory increase without release',
        severity: 'high',
        solutions: [
          { description: 'Use heap profiler to identify leaking objects', confidence: 0.9, source: 'gaia', verified: true },
          { description: 'Check for event listener accumulation', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'memory', 'leak']
      },
      {
        source: 'gaia-aiops',
        category: 'network',
        signature: 'Latency anomaly detected',
        description: 'Response time exceeds normal threshold',
        severity: 'medium',
        solutions: [
          { description: 'Check database query performance', confidence: 0.8, source: 'gaia', verified: true },
          { description: 'Add caching layer for frequent queries', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'latency', 'network']
      }
    ];
  }

  /** Security vulnerability patterns with CWE ids (source `huggingface-cve`). */
  private generateCVEPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Path traversal vulnerability',
        description: 'Directory traversal allows file access outside intended directory',
        severity: 'critical',
        solutions: [
          { description: 'Validate and sanitize file paths', confidence: 0.95, source: 'cve-db', verified: true },
          { description: 'Use path.resolve() and check against base directory', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'path', 'traversal'],
        cweId: 'CWE-22'
      },
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Command injection vulnerability',
        description: 'User input executed as system command',
        severity: 'critical',
        solutions: [
          { description: 'Never pass user input directly to exec/spawn', confidence: 0.99, source: 'cve-db', verified: true },
          { description: 'Use parameterized commands with execFile()', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'injection', 'command'],
        cweId: 'CWE-78'
      },
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Prototype pollution vulnerability',
        description: 'Object prototype can be modified through user input',
        severity: 'high',
        solutions: [
          { description: 'Use Object.create(null) for dictionaries', confidence: 0.9, source: 'cve-db', verified: true },
          { description: 'Validate object keys against __proto__ and constructor', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'prototype', 'javascript'],
        cweId: 'CWE-1321',
        language: 'javascript'
      }
    ];
  }

  /** C memory-safety patterns (source `huggingface-defect-detection`). */
  private generateDefectDetectionPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-defect-detection',
        category: 'security',
        signature: 'Buffer overflow vulnerability',
        description: 'Writing beyond buffer bounds',
        severity: 'critical',
        solutions: [
          { description: 'Use bounds-checked buffer operations', confidence: 0.95, source: 'defect-detection', verified: true },
          { description: 'Use safe string functions (strncpy instead of strcpy)', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'buffer', 'overflow'],
        cweId: 'CWE-120',
        language: 'c'
      },
      {
        source: 'huggingface-defect-detection',
        category: 'memory',
        signature: 'Use after free vulnerability',
        description: 'Memory accessed after being freed',
        severity: 'critical',
        solutions: [
          { description: 'Set pointer to NULL after free', confidence: 0.9, source: 'defect-detection', verified: true },
          { description: 'Use smart pointers in C++', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'memory', 'uaf'],
        cweId: 'CWE-416',
        language: 'c'
      }
    ];
  }

  /** HDFS operational patterns (source `huggingface-hdfs-logs`). */
  private generateHDFSLogPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-hdfs-logs',
        category: 'database',
        signature: 'HDFS block replication failure',
        description: 'Block could not be replicated to target datanodes',
        severity: 'high',
        solutions: [
          { description: 'Check datanode disk space', confidence: 0.9, source: 'hdfs-logs', verified: true },
          { description: 'Verify network connectivity between nodes', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['hdfs', 'replication', 'distributed']
      },
      {
        source: 'huggingface-hdfs-logs',
        category: 'database',
        signature: 'NameNode safe mode active',
        description: 'HDFS in safe mode, write operations blocked',
        severity: 'high',
        solutions: [
          { description: 'Wait for automatic safe mode exit', confidence: 0.8, source: 'hdfs-logs', verified: true },
          { description: 'Manually leave safe mode: hdfs dfsadmin -safemode leave', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['hdfs', 'safemode', 'namenode']
      }
    ];
  }

  /**
   * Get ingestion statistics: every registered source with its recorded
   * stats (if any), plus the knowledge base's own statistics.
   */
  public getStats() {
    return {
      sources: EXTERNAL_SOURCES.map(s => ({
        ...s,
        stats: this.ingestionStats.get(s.name)
      })),
      knowledgeBase: errorKnowledgeBase.getStats()
    };
  }

  /**
   * List available sources for API consumers.
   *
   * @returns One entry per registered source; `stats` is undefined until the
   *          source has been ingested (or has failed) at least once.
   */
  public listSources(): { name: string; source: string; enabled: boolean; description: string; stats?: { lastRun: Date; count: number; errors: number } }[] {
    return EXTERNAL_SOURCES.map(s => ({
      name: s.name,
      source: s.source,
      enabled: s.enabled,
      description: s.description,
      stats: this.ingestionStats.get(s.name)
    }));
  }

  /**
   * Ingest from a specific source, looked up by display name or by source
   * identifier. The source's `enabled` flag is not consulted here.
   *
   * @param sourceName Either `ExternalSource.name` or `ExternalSource.source`.
   * @throws Error when no registered source matches `sourceName`.
   */
  public async ingestFromSource(sourceName: string): Promise<{ total: number; new: number; duplicates: number }> {
    const source = EXTERNAL_SOURCES.find(s => s.name === sourceName || s.source === sourceName);
    if (!source) {
      throw new Error(`Source not found: ${sourceName}`);
    }
    return this.ingestSource(source);
  }
}
// Singleton export: the shared instance, obtained via getInstance() when this module is first loaded.
export const errorDatabaseIngestor = ErrorDatabaseIngestor.getInstance();