/**
 * ═══════════════════════════════════════════════════════════════════════════
 * ERROR DATABASE INGESTOR
 * ═══════════════════════════════════════════════════════════════════════════
 * Fetches error patterns from GitHub, HuggingFace, and other sources.
 * Used to train SelfHealer.
 */

import { logger } from '../../utils/logger.js';
import { errorKnowledgeBase, type ErrorPattern, type ErrorSource, type ErrorCategory } from '../ErrorKnowledgeBase.js';
import { withRetry, isRetryableError } from '../../utils/resilience.js';

const log = logger.child({ module: 'ErrorDatabaseIngestor' });

/**
 * Shape of a pattern that can be handed to `errorKnowledgeBase.batchIngest`.
 *
 * Derived from the `batchIngest` signature so this file can never drift from
 * what the knowledge base actually accepts.
 * NOTE(review): assumes the first parameter of `batchIngest` is an array of
 * patterns — confirm against ErrorKnowledgeBase.
 */
type IngestablePattern = Parameters<typeof errorKnowledgeBase.batchIngest>[0][number];

/** Per-source bookkeeping kept in memory by the ingestor. */
export interface IngestionStats {
  /** When the entry was last written (successful run, or first failed attempt). */
  lastRun: Date;
  /** Number of new patterns reported by the last successful run. */
  count: number;
  /** Failures recorded since the last successful run. */
  errors: number;
}

// ═══════════════════════════════════════════════════════════════════════════
// EXTERNAL SOURCE DEFINITIONS
// ═══════════════════════════════════════════════════════════════════════════

/** Static description of one external error-pattern source. */
export interface ExternalSource {
  name: string;
  source: ErrorSource;
  type: 'github' | 'huggingface' | 'api';
  url: string;
  description: string;
  enabled: boolean;
}

export const EXTERNAL_SOURCES: ExternalSource[] = [
  // GitHub Sources
  {
    name: 'Defects4J',
    source: 'github-defects4j',
    type: 'github',
    url: 'https://api.github.com/repos/rjust/defects4j/contents/framework/bug-mining',
    description: '854 reproducible Java bugs',
    enabled: true
  },
  {
    name: 'BugsJS',
    source: 'github-bugsjs',
    type: 'github',
    url: 'https://api.github.com/repos/nicola/BugsJS/contents/dataset',
    description: '453 JavaScript/Node.js bugs',
    enabled: true
  },
  {
    name: 'GAIA AIOps Dataset',
    source: 'gaia-aiops',
    type: 'github',
    url: 'https://api.github.com/repos/CloudWise-OpenSource/GAIA-DataSet/contents',
    description: 'AIOps anomaly detection dataset',
    enabled: true
  },
  // HuggingFace Sources
  {
    name: 'CVE Training Dataset',
    source: 'huggingface-cve',
    type: 'huggingface',
    url: 'https://huggingface.co/api/datasets/AlicanKiraz0/All-CVE-Records-Training-Dataset',
    description: '300K+ CVE records 1999-2025',
    enabled: true
  },
  {
    name: 'Defect Detection',
    source: 'huggingface-defect-detection',
    type: 'huggingface',
    url: 'https://huggingface.co/api/datasets/mcanoglu/defect-detection',
    description: 'Safe/vulnerable code pairs',
    enabled: true
  },
  {
    name: 'HDFS Log Dataset',
    source: 'huggingface-hdfs-logs',
    type: 'huggingface',
    url: 'https://huggingface.co/api/datasets/logfit-project/HDFS_v1',
    description: 'HDFS system logs for anomaly detection',
    enabled: true
  },
  // Microsoft Sources
  {
    name: 'Office API Errors',
    source: 'microsoft-office-api',
    type: 'api',
    url: 'https://learn.microsoft.com/api/apibrowser/dotnet/namespaces',
    description: 'Office Add-ins error codes',
    enabled: true
  },
  {
    name: 'Microsoft Graph Errors',
    source: 'microsoft-graph-api',
    type: 'api',
    url: 'https://graph.microsoft.com/v1.0/$metadata',
    description: 'Graph API error codes',
    enabled: true
  }
];

// ═══════════════════════════════════════════════════════════════════════════
// INGESTOR CLASS
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Collects error patterns for each entry of `EXTERNAL_SOURCES` and feeds them
 * into `errorKnowledgeBase`. The pattern lists are currently hard-coded in the
 * `generate*Patterns` methods; no network request is made by this class.
 */
export class ErrorDatabaseIngestor {
  private static instance: ErrorDatabaseIngestor | undefined;

  /** Stats keyed by `ExternalSource.name`. */
  private readonly ingestionStats = new Map<string, IngestionStats>();

  /** Returns the shared ingestor, creating it on first use. */
  public static getInstance(): ErrorDatabaseIngestor {
    let existing = ErrorDatabaseIngestor.instance;
    if (!existing) {
      existing = new ErrorDatabaseIngestor();
      ErrorDatabaseIngestor.instance = existing;
    }
    return existing;
  }

  /**
   * Ingest from all enabled sources, then ingest the built-in Microsoft
   * Office / Graph error codes.
   *
   * A failing source never aborts the run: it is logged and counted in
   * `failed`. The built-in step is handled the same way, so the tally of the
   * preceding sources is always returned.
   *
   * @returns `success` / `failed` counts and the total of new patterns.
   */
  public async ingestAll(): Promise<{ success: number; failed: number; patterns: number }> {
    log.info('Starting full error database ingestion...');

    let success = 0;
    let failed = 0;
    let totalPatterns = 0;

    for (const source of EXTERNAL_SOURCES.filter(s => s.enabled)) {
      try {
        const result = await this.ingestSource(source);
        totalPatterns += result.new;
        success++;
        log.info(`✓ ${source.name}: ${result.new} new patterns (${result.duplicates} dupes)`);
      } catch (error) {
        failed++;
        log.error(`✗ ${source.name} failed:`, error);
      }
    }

    // Also ingest built-in Microsoft Office error codes. Guarded so that a
    // failure here cannot discard the counts gathered above.
    try {
      totalPatterns += await this.ingestMicrosoftOfficeErrors();
    } catch (error) {
      failed++;
      log.error('✗ Built-in Microsoft error codes failed:', error);
    }

    log.info(`Ingestion complete: ${success} sources, ${totalPatterns} new patterns`);

    return { success, failed, patterns: totalPatterns };
  }

  /**
   * Ingest from a single source.
   *
   * On success the source's stats entry is replaced (errors reset to 0). On
   * failure the error counter is bumped — creating the entry if this source
   * has never succeeded — and the original error is rethrown.
   *
   * @param source The source definition to ingest.
   * @returns The result object produced by `errorKnowledgeBase.batchIngest`.
   * @throws Whatever the fetch step or `batchIngest` throws.
   */
  public async ingestSource(source: ExternalSource): Promise<{ total: number; new: number; duplicates: number }> {
    try {
      const patterns = await this.fetchPatterns(source);
      const result = await errorKnowledgeBase.batchIngest(patterns, source.source);

      // Update stats
      this.ingestionStats.set(source.name, {
        lastRun: new Date(),
        count: result.new,
        errors: 0
      });

      return result;
    } catch (error) {
      this.recordFailure(source.name);
      throw error;
    }
  }

  /** Bumps the error counter for a source, creating its entry when missing. */
  private recordFailure(sourceName: string): void {
    const stats = this.ingestionStats.get(sourceName);
    if (stats) {
      stats.errors++;
      return;
    }
    // First attempt failed: without this entry the failure would be invisible
    // in getStats() / listSources().
    this.ingestionStats.set(sourceName, { lastRun: new Date(), count: 0, errors: 1 });
  }

  /** Dispatches to the fetcher matching `source.type`. */
  private async fetchPatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    switch (source.type) {
      case 'github':
        return this.fetchGitHubPatterns(source);
      case 'huggingface':
        return this.fetchHuggingFacePatterns(source);
      case 'api':
        return this.fetchApiPatterns(source);
      default:
        // Unknown source types contribute no patterns.
        return [];
    }
  }

  /**
   * Fetch patterns from GitHub repositories.
   *
   * Simulated: returns the hard-coded list for the given source. In
   * production, this would make actual API calls.
   */
  private async fetchGitHubPatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    switch (source.source) {
      case 'github-defects4j':
        // Java bug patterns from Defects4J
        return this.generateDefects4JPatterns();
      case 'github-bugsjs':
        // JavaScript bug patterns from BugsJS
        return this.generateBugsJSPatterns();
      case 'gaia-aiops':
        // AIOps patterns
        return this.generateGAIAPatterns();
      default:
        return [];
    }
  }

  /** Fetch patterns from HuggingFace datasets (hard-coded lists per source). */
  private async fetchHuggingFacePatterns(source: ExternalSource): Promise<IngestablePattern[]> {
    switch (source.source) {
      case 'huggingface-cve':
        return this.generateCVEPatterns();
      case 'huggingface-defect-detection':
        return this.generateDefectDetectionPatterns();
      case 'huggingface-hdfs-logs':
        return this.generateHDFSLogPatterns();
      default:
        return [];
    }
  }

  /**
   * Fetch patterns from APIs.
   *
   * Always empty: the Microsoft patterns are ingested separately by
   * `ingestMicrosoftOfficeErrors`.
   */
  private async fetchApiPatterns(_source: ExternalSource): Promise<IngestablePattern[]> {
    return [];
  }

  /**
   * Ingest the built-in Microsoft Office and Graph error codes.
   *
   * @returns The number of new patterns reported by `batchIngest`.
   */
  private async ingestMicrosoftOfficeErrors(): Promise<number> {
    const officeErrors: IngestablePattern[] = [
      // Office Common API errors
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: InvalidArgument',
        description: 'An invalid argument was passed to the function',
        severity: 'medium',
        solutions: [
          { description: 'Check that all required parameters are provided', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Validate parameter types match expected types', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'api', 'argument']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: GeneralException',
        description: 'General error during Office operation',
        severity: 'high',
        solutions: [
          { description: 'Check Office application logs for details', confidence: 0.7, source: 'microsoft-docs', verified: true },
          { description: 'Ensure document is not corrupted', confidence: 0.6, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'general']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: ItemNotFound',
        description: 'The requested item does not exist',
        severity: 'medium',
        solutions: [
          { description: 'Check if item exists before accessing', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Use getItemOrNullObject() method', confidence: 0.95, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'excel', 'word', 'item']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'OfficeExtension.Error: AccessDenied',
        description: 'Access to resource is denied',
        severity: 'high',
        solutions: [
          { description: 'Check add-in permissions in manifest', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Request appropriate API permissions', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'permission', 'security']
      },
      // Excel-specific errors
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'Excel.Error: InvalidBinding',
        description: 'Excel binding is no longer valid',
        severity: 'medium',
        solutions: [
          { description: 'Re-create the binding when document reopens', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Store binding references in Office settings', confidence: 0.8, source: 'microsoft-docs', verified: true }
        ],
        tags: ['excel', 'binding']
      },
      {
        source: 'microsoft-office-api',
        category: 'office',
        signature: 'RichApi.Error: The operation is invalid for the object',
        description: 'Operation not valid for current object state',
        severity: 'medium',
        solutions: [
          { description: 'Sync context before accessing properties', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Use context.sync() before reading values', confidence: 0.95, source: 'microsoft-docs', verified: true }
        ],
        tags: ['office', 'async', 'sync']
      },
      // Graph API errors
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: BadRequest',
        description: 'Invalid request syntax or parameters',
        severity: 'medium',
        solutions: [
          { description: 'Validate request body JSON format', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check required fields are present', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'api', 'request']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: Unauthorized',
        description: 'Authentication token missing or invalid',
        severity: 'high',
        solutions: [
          { description: 'Refresh access token', confidence: 0.95, source: 'microsoft-docs', verified: true },
          { description: 'Check token has required scopes', confidence: 0.9, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'auth', 'token']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: Forbidden',
        description: 'Insufficient permissions for operation',
        severity: 'high',
        solutions: [
          { description: 'Request admin consent for required permissions', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check Azure AD app registration permissions', confidence: 0.85, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'permission', 'consent']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: NotFound',
        description: 'Resource does not exist',
        severity: 'medium',
        solutions: [
          { description: 'Verify resource ID is correct', confidence: 0.9, source: 'microsoft-docs', verified: true },
          { description: 'Check if resource was deleted', confidence: 0.7, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'resource']
      },
      {
        source: 'microsoft-graph-api',
        category: 'api',
        signature: 'Graph API Error: ServiceUnavailable',
        description: 'Microsoft Graph service temporarily unavailable',
        severity: 'high',
        solutions: [
          { description: 'Implement retry with exponential backoff', confidence: 0.95, source: 'microsoft-docs', verified: true },
          { description: 'Check Microsoft 365 service health dashboard', confidence: 0.7, source: 'microsoft-docs', verified: true }
        ],
        tags: ['graph', 'availability', 'retry']
      }
    ];

    // NOTE(review): the Graph patterns above are submitted under the
    // 'microsoft-office-api' label as well — confirm how batchIngest uses its
    // second argument before relying on per-source attribution.
    const result = await errorKnowledgeBase.batchIngest(officeErrors, 'microsoft-office-api');
    return result.new;
  }

  // ═══════════════════════════════════════════════════════════════════════════
  // PATTERN GENERATORS (Simulated data from research papers)
  // ═══════════════════════════════════════════════════════════════════════════

  /** Hard-coded Java runtime/concurrency patterns attributed to Defects4J. */
  private generateDefects4JPatterns(): IngestablePattern[] {
    return [
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.NullPointerException',
        description: 'Null pointer dereference in Java',
        severity: 'high',
        solutions: [
          { description: 'Add null check before method call', confidence: 0.9, source: 'defects4j', verified: true },
          { description: 'Use Optional for nullable values', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'null', 'npe'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.ArrayIndexOutOfBoundsException',
        description: 'Array index out of bounds',
        severity: 'high',
        solutions: [
          { description: 'Validate array index before access', confidence: 0.9, source: 'defects4j', verified: true },
          { description: 'Use enhanced for-loop instead of index', confidence: 0.8, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'array', 'bounds'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'runtime',
        signature: 'java.lang.ClassCastException',
        description: 'Invalid type cast',
        severity: 'medium',
        solutions: [
          { description: 'Use instanceof check before casting', confidence: 0.95, source: 'defects4j', verified: true },
          { description: 'Use generics to avoid raw types', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'cast', 'type'],
        language: 'java'
      },
      {
        source: 'github-defects4j',
        category: 'concurrency',
        signature: 'java.util.ConcurrentModificationException',
        description: 'Collection modified during iteration',
        severity: 'high',
        solutions: [
          { description: 'Use Iterator.remove() instead of Collection.remove()', confidence: 0.95, source: 'defects4j', verified: true },
          { description: 'Use ConcurrentHashMap for concurrent access', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['java', 'concurrent', 'collection'],
        language: 'java'
      }
    ];
  }

  /** Hard-coded JavaScript/Node.js runtime patterns attributed to BugsJS. */
  private generateBugsJSPatterns(): IngestablePattern[] {
    return [
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'TypeError: callback is not a function',
        description: 'Callback parameter is not a function',
        severity: 'high',
        solutions: [
          { description: 'Check callback existence: if (typeof callback === "function")', confidence: 0.95, source: 'bugsjs', verified: true },
          { description: 'Provide default callback: callback = callback || (() => {})', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'callback', 'async'],
        language: 'javascript',
        framework: 'node'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'ReferenceError: variable is not defined',
        description: 'Undefined variable reference',
        severity: 'high',
        solutions: [
          { description: 'Declare variable before use', confidence: 0.9, source: 'bugsjs', verified: true },
          { description: 'Use strict mode to catch undeclared variables', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'variable', 'scope'],
        language: 'javascript'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'UnhandledPromiseRejectionWarning',
        description: 'Promise rejection not handled',
        severity: 'high',
        solutions: [
          { description: 'Add .catch() handler to promise chain', confidence: 0.95, source: 'bugsjs', verified: true },
          { description: 'Use try-catch with async/await', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['javascript', 'promise', 'async'],
        language: 'javascript',
        framework: 'node'
      },
      {
        source: 'github-bugsjs',
        category: 'runtime',
        signature: 'Error: ENOENT: no such file or directory',
        description: 'File or directory not found',
        severity: 'medium',
        solutions: [
          { description: 'Check file exists with fs.existsSync() before access', confidence: 0.9, source: 'bugsjs', verified: true },
          { description: 'Create directory with fs.mkdirSync({ recursive: true })', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['node', 'fs', 'file'],
        language: 'javascript',
        framework: 'node'
      }
    ];
  }

  /** Hard-coded performance/network anomaly patterns attributed to GAIA. */
  private generateGAIAPatterns(): IngestablePattern[] {
    return [
      {
        source: 'gaia-aiops',
        category: 'performance',
        signature: 'CPU utilization spike detected',
        description: 'Abnormal CPU usage pattern',
        severity: 'medium',
        solutions: [
          { description: 'Profile application to identify CPU-intensive operations', confidence: 0.8, source: 'gaia', verified: true },
          { description: 'Consider horizontal scaling', confidence: 0.7, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'cpu', 'performance']
      },
      {
        source: 'gaia-aiops',
        category: 'performance',
        signature: 'Memory leak pattern detected',
        description: 'Gradual memory increase without release',
        severity: 'high',
        solutions: [
          { description: 'Use heap profiler to identify leaking objects', confidence: 0.9, source: 'gaia', verified: true },
          { description: 'Check for event listener accumulation', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'memory', 'leak']
      },
      {
        source: 'gaia-aiops',
        category: 'network',
        signature: 'Latency anomaly detected',
        description: 'Response time exceeds normal threshold',
        severity: 'medium',
        solutions: [
          { description: 'Check database query performance', confidence: 0.8, source: 'gaia', verified: true },
          { description: 'Add caching layer for frequent queries', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['aiops', 'latency', 'network']
      }
    ];
  }

  /** Hard-coded security patterns (with CWE ids) attributed to the CVE dataset. */
  private generateCVEPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Path traversal vulnerability',
        description: 'Directory traversal allows file access outside intended directory',
        severity: 'critical',
        solutions: [
          { description: 'Validate and sanitize file paths', confidence: 0.95, source: 'cve-db', verified: true },
          { description: 'Use path.resolve() and check against base directory', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'path', 'traversal'],
        cweId: 'CWE-22'
      },
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Command injection vulnerability',
        description: 'User input executed as system command',
        severity: 'critical',
        solutions: [
          { description: 'Never pass user input directly to exec/spawn', confidence: 0.99, source: 'cve-db', verified: true },
          { description: 'Use parameterized commands with execFile()', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'injection', 'command'],
        cweId: 'CWE-78'
      },
      {
        source: 'huggingface-cve',
        category: 'security',
        signature: 'Prototype pollution vulnerability',
        description: 'Object prototype can be modified through user input',
        severity: 'high',
        solutions: [
          { description: 'Use Object.create(null) for dictionaries', confidence: 0.9, source: 'cve-db', verified: true },
          { description: 'Validate object keys against __proto__ and constructor', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'prototype', 'javascript'],
        cweId: 'CWE-1321',
        language: 'javascript'
      }
    ];
  }

  /** Hard-coded C memory-safety patterns attributed to the defect-detection dataset. */
  private generateDefectDetectionPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-defect-detection',
        category: 'security',
        signature: 'Buffer overflow vulnerability',
        description: 'Writing beyond buffer bounds',
        severity: 'critical',
        solutions: [
          { description: 'Use bounds-checked buffer operations', confidence: 0.95, source: 'defect-detection', verified: true },
          { description: 'Use safe string functions (strncpy instead of strcpy)', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'buffer', 'overflow'],
        cweId: 'CWE-120',
        language: 'c'
      },
      {
        source: 'huggingface-defect-detection',
        category: 'memory',
        signature: 'Use after free vulnerability',
        description: 'Memory accessed after being freed',
        severity: 'critical',
        solutions: [
          { description: 'Set pointer to NULL after free', confidence: 0.9, source: 'defect-detection', verified: true },
          { description: 'Use smart pointers in C++', confidence: 0.95, source: 'best-practice', verified: true }
        ],
        tags: ['security', 'memory', 'uaf'],
        cweId: 'CWE-416',
        language: 'c'
      }
    ];
  }

  /** Hard-coded HDFS operational patterns attributed to the HDFS log dataset. */
  private generateHDFSLogPatterns(): IngestablePattern[] {
    return [
      {
        source: 'huggingface-hdfs-logs',
        category: 'database',
        signature: 'HDFS block replication failure',
        description: 'Block could not be replicated to target datanodes',
        severity: 'high',
        solutions: [
          { description: 'Check datanode disk space', confidence: 0.9, source: 'hdfs-logs', verified: true },
          { description: 'Verify network connectivity between nodes', confidence: 0.85, source: 'best-practice', verified: true }
        ],
        tags: ['hdfs', 'replication', 'distributed']
      },
      {
        source: 'huggingface-hdfs-logs',
        category: 'database',
        signature: 'NameNode safe mode active',
        description: 'HDFS in safe mode, write operations blocked',
        severity: 'high',
        solutions: [
          { description: 'Wait for automatic safe mode exit', confidence: 0.8, source: 'hdfs-logs', verified: true },
          { description: 'Manually leave safe mode: hdfs dfsadmin -safemode leave', confidence: 0.9, source: 'best-practice', verified: true }
        ],
        tags: ['hdfs', 'safemode', 'namenode']
      }
    ];
  }

  /**
   * Get ingestion statistics: every configured source with its stats entry
   * (undefined if the source was never attempted) plus the knowledge base's
   * own stats.
   */
  public getStats() {
    return {
      sources: EXTERNAL_SOURCES.map(s => ({
        ...s,
        stats: this.ingestionStats.get(s.name)
      })),
      knowledgeBase: errorKnowledgeBase.getStats()
    };
  }

  /** List available sources for API consumers, with stats where recorded. */
  public listSources(): {
    name: string;
    source: string;
    enabled: boolean;
    description: string;
    stats?: IngestionStats;
  }[] {
    return EXTERNAL_SOURCES.map(s => ({
      name: s.name,
      source: s.source,
      enabled: s.enabled,
      description: s.description,
      stats: this.ingestionStats.get(s.name)
    }));
  }

  /**
   * Ingest from a specific source, looked up by display name or source id.
   *
   * @param sourceName Either `ExternalSource.name` or `ExternalSource.source`.
   * @throws Error when no configured source matches.
   */
  public async ingestFromSource(sourceName: string): Promise<{ total: number; new: number; duplicates: number }> {
    const source = EXTERNAL_SOURCES.find(s => s.name === sourceName || s.source === sourceName);
    if (!source) {
      throw new Error(`Source not found: ${sourceName}`);
    }
    return this.ingestSource(source);
  }
}

// Singleton export
export const errorDatabaseIngestor = ErrorDatabaseIngestor.getInstance();