Kraft102 committed on
Commit
a84b07b
·
verified ·
1 Parent(s): 66f3b51

Deploy from GitHub Actions 2025-12-15_13-51-53

Browse files
apps/backend/package.json CHANGED
@@ -6,12 +6,13 @@
6
  "scripts": {
7
  "dev": "tsx watch src/index.ts",
8
  "build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
9
- "build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --format=esm",
10
  "build:tsc": "tsc",
11
  "start": "node dist/index.js",
12
  "test": "vitest run",
13
  "neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
14
- "neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js"
 
15
  },
16
  "dependencies": {
17
  "@anthropic-ai/sdk": "^0.71.0",
@@ -86,4 +87,4 @@
86
  "tsx": "^4.20.6",
87
  "typescript": "~5.8.2"
88
  }
89
- }
 
6
  "scripts": {
7
  "dev": "tsx watch src/index.ts",
8
  "build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
9
+ "build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --external:dotenv --format=esm",
10
  "build:tsc": "tsc",
11
  "start": "node dist/index.js",
12
  "test": "vitest run",
13
  "neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
14
+ "neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js",
15
+ "ingest-drive": "tsx src/scripts/ingest-drive.ts"
16
  },
17
  "dependencies": {
18
  "@anthropic-ai/sdk": "^0.71.0",
 
87
  "tsx": "^4.20.6",
88
  "typescript": "~5.8.2"
89
  }
90
+ }
apps/backend/src/mcp/ingestionHandlers.ts CHANGED
@@ -72,32 +72,39 @@ export async function ingestionStartHandler(params: any): Promise<any> {
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
- const url = params.url;
76
-
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
- // Force the Web Crawler adapter to handle this URL
83
- // In a real system, the engine might route based on pattern, but here we invoke directly or rely on the engine identifying it's a URL
84
- // Since we registered WebCrawler, ingestionEngine.ingestFrom(url) should work if the adapter's fetch handles it.
85
- // However, standard ingestFrom typically takes an adapter ID or similar if it's scanning.
86
- // But WebCrawlerAdapter.fetch takes a source string.
87
- // We need to ensure dataIngestionEngine routes "http..." strings to WebCrawlerAdapter.
88
-
89
- // For now, we can manually invoke the adapter if we can access it, OR simpler:
90
- // We trust ingestFrom to iterate adapters or we specifically use the crawler instance if we exposed it.
91
- // Let's assume ingestFrom(url) works if we modified DataIngestionEngine to handle specific inputs,
92
- // OR we just perform the crawl here directly using a new instance if needed, but better to go through engine.
93
-
94
- const count = await dataIngestionEngine.ingestFrom(url);
 
 
 
 
 
 
95
  return {
96
  success: true,
97
- message: `Crawled and ingested ${url}`,
98
  count
99
  };
100
  } catch (error: any) {
 
101
  return { success: false, error: error.message };
102
  }
103
  }
@@ -106,7 +113,7 @@ export async function ingestionHarvestHandler(params: any): Promise<any> {
106
  // Trigger the autonomous harvester
107
  // We don't await the full process to prevent timeout, but we start it.
108
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
109
-
110
  return {
111
  success: true,
112
  message: 'Autonomous Harvest Sequence Initiated',
@@ -119,7 +126,7 @@ export async function ingestionStatusHandler(params: any): Promise<any> {
119
  // Get real stats from Vector Store
120
  const vectorStore = getNeo4jVectorStore();
121
  const stats = await vectorStore.getStatistics();
122
-
123
  const engineStatus = dataIngestionEngine.getStatus();
124
 
125
  return {
 
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
+ const { url, mode, depth, keywords } = params;
76
+
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
+ const crawler = dataIngestionEngine.getAdapter('Web Crawler') as WebCrawlerAdapter;
83
+
84
+ if (!crawler) {
85
+ return { success: false, error: 'Web Crawler adapter not initialized' };
86
+ }
87
+
88
+ // Configure crawler with request parameters
89
+ crawler.configure({
90
+ mode: mode || 'single',
91
+ maxDepth: depth !== undefined ? depth : 1,
92
+ keywords: keywords || []
93
+ });
94
+
95
+ // Add URL to crawler queue
96
+ crawler.addUrls([url]);
97
+
98
+ // Trigger specific ingestion for the crawler
99
+ const count = await dataIngestionEngine.ingestFrom('Web Crawler');
100
+
101
  return {
102
  success: true,
103
+ message: `Crawled ${url} (Mode: ${mode || 'single'}, Depth: ${depth}, Keywords: ${keywords?.length || 0})`,
104
  count
105
  };
106
  } catch (error: any) {
107
+ console.error('Crawl handler error:', error);
108
  return { success: false, error: error.message };
109
  }
110
  }
 
113
  // Trigger the autonomous harvester
114
  // We don't await the full process to prevent timeout, but we start it.
115
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
116
+
117
  return {
118
  success: true,
119
  message: 'Autonomous Harvest Sequence Initiated',
 
126
  // Get real stats from Vector Store
127
  const vectorStore = getNeo4jVectorStore();
128
  const stats = await vectorStore.getStatistics();
129
+
130
  const engineStatus = dataIngestionEngine.getStatus();
131
 
132
  return {
apps/backend/src/scripts/ingest-drive.ts ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as dotenv from 'dotenv';
2
+ import * as path from 'path';
3
+ import { ingestRepository } from '../services/GraphIngestor.js';
4
+ import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
5
+
6
+ dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });
7
+
8
+ const FORBIDDEN_PATHS = [
9
+ /^c:[\/\\]?$/i, // C:, C:\, C:/
10
+ /^c:[\/\\]windows/i,
11
+ /^c:[\/\\]program files/i,
12
+ /^c:[\/\\]program files \(x86\)/i,
13
+ /^c:[\/\\]users[\/\\][^\\\/]+[\/\\]appdata/i // AppData
14
+ ];
15
+
16
+ async function main() {
17
+ const args = process.argv.slice(2);
18
+ const targetPath = args[0];
19
+
20
+ if (!targetPath) {
21
+ console.error('❌ Usage: npm run ingest-drive -- <path>');
22
+ console.error('Example: npm run ingest-drive -- "C:\\Users\\claus\\Documents"');
23
+ process.exit(1);
24
+ }
25
+
26
+ // Safety checks
27
+ const normalizedPath = path.normalize(targetPath);
28
+
29
+ for (const forbidden of FORBIDDEN_PATHS) {
30
+ if (forbidden.test(normalizedPath)) {
31
+ console.error(`❌ DENIED: Ingesting '${normalizedPath}' is restricted for safety reasons.`);
32
+ console.error('Please point to a specific subdirectory (e.g., Documents, Projects).');
33
+ process.exit(1);
34
+ }
35
+ }
36
+
37
+ console.log(`🚀 Starting ingestion of: ${normalizedPath}`);
38
+ console.log('NOTE: This may take a while depending on drive size.');
39
+
40
+ try {
41
+ const result = await ingestRepository({
42
+ rootPath: normalizedPath,
43
+ maxDepth: 10,
44
+ parseContent: true, // Read file content
45
+ generateEmbeddings: true, // Generate vectors
46
+ includePatterns: ['*'],
47
+ excludePatterns: [
48
+ 'node_modules', '.git', 'dist', 'build', 'coverage',
49
+ '$Recycle.Bin', 'System Volume Information', 'pagefile.sys', 'swapfile.sys',
50
+ '.vscode', '.idea'
51
+ ]
52
+ });
53
+
54
+ if (result.success) {
55
+ console.log('✅ Ingestion completed successfully!');
56
+ console.log('Stats:', result.stats);
57
+ } else {
58
+ console.error('❌ Ingestion finished with errors:', result.errors);
59
+ }
60
+
61
+ } catch (error) {
62
+ console.error('💥 Fatal error:', error);
63
+ } finally {
64
+ await neo4jAdapter.close();
65
+ process.exit(0);
66
+ }
67
+ }
68
+
69
+ main();
apps/backend/src/services/ingestion/DataIngestionEngine.ts CHANGED
@@ -40,6 +40,12 @@ export class DataIngestionEngine {
40
  console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
41
  }
42
 
 
 
 
 
 
 
43
  /** Start ingestion from all registered adapters */
44
  async ingestAll(): Promise<void> {
45
  if (this.isRunning) {
 
40
  console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
41
  }
42
 
43
+ /** Get a registered adapter */
44
+ getAdapter(name: string): DataSourceAdapter | undefined {
45
+ return this.adapters.get(name);
46
+ }
47
+
48
+
49
  /** Start ingestion from all registered adapters */
50
  async ingestAll(): Promise<void> {
51
  if (this.isRunning) {
apps/backend/src/services/ingestion/WebCrawlerAdapter.ts CHANGED
@@ -1,6 +1,13 @@
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
 
 
 
 
 
 
 
4
 
5
  export class WebCrawlerAdapter implements DataSourceAdapter {
6
  name = 'Web Crawler';
@@ -8,27 +15,92 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
8
 
9
  private visitedUrls: Set<string> = new Set();
10
  private urlsToFetch: string[] = [];
 
 
 
 
 
11
 
12
- constructor() {}
 
 
 
 
 
 
13
 
14
  /** Add URLs to fetch queue */
15
  addUrls(urls: string[]): void {
16
- this.urlsToFetch = urls.filter(u => u.startsWith('http'));
 
17
  }
18
 
19
  async fetch(): Promise<any[]> {
20
  const results: any[] = [];
21
- for (const url of this.urlsToFetch) {
 
 
 
 
 
 
 
 
 
22
  if (this.visitedUrls.has(url)) continue;
23
  this.visitedUrls.add(url);
24
 
25
- console.log(`🕷️ Crawling: ${url}`);
 
26
  try {
27
  const response = await axios.get(url, {
28
  timeout: 10000,
29
- headers: { 'User-Agent': 'WidgeTDC-Crawler/1.0' }
30
  });
31
- results.push({ url, html: response.data });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  } catch (error: any) {
33
  console.error(`Crawl failed for ${url}:`, error.message);
34
  }
@@ -57,7 +129,8 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
57
  metadata: {
58
  url: item.url,
59
  description: description,
60
- crawledAt: new Date().toISOString()
 
61
  },
62
  timestamp: new Date()
63
  };
 
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
4
+ import { URL } from 'url';
5
+
6
+ interface CrawlerConfig {
7
+ mode: 'single' | 'directory' | 'recursive';
8
+ maxDepth: number;
9
+ keywords: string[];
10
+ }
11
 
12
  export class WebCrawlerAdapter implements DataSourceAdapter {
13
  name = 'Web Crawler';
 
15
 
16
  private visitedUrls: Set<string> = new Set();
17
  private urlsToFetch: string[] = [];
18
+ private config: CrawlerConfig = {
19
+ mode: 'single',
20
+ maxDepth: 1,
21
+ keywords: []
22
+ };
23
 
24
+ constructor() { }
25
+
26
+ configure(config: Partial<CrawlerConfig>) {
27
+ if (config.mode) this.config.mode = config.mode;
28
+ if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
29
+ if (config.keywords) this.config.keywords = config.keywords;
30
+ }
31
 
32
  /** Add URLs to fetch queue */
33
  addUrls(urls: string[]): void {
34
+ const validUrls = urls.filter(u => u.startsWith('http'));
35
+ this.urlsToFetch.push(...validUrls);
36
  }
37
 
38
  async fetch(): Promise<any[]> {
39
  const results: any[] = [];
40
+ const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
41
+
42
+ // Clear initial queue as we are processing it now
43
+ this.urlsToFetch = [];
44
+
45
+ while (queue.length > 0) {
46
+ const item = queue.shift();
47
+ if (!item) break;
48
+ const { url, depth } = item;
49
+
50
  if (this.visitedUrls.has(url)) continue;
51
  this.visitedUrls.add(url);
52
 
53
+ console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);
54
+
55
  try {
56
  const response = await axios.get(url, {
57
  timeout: 10000,
58
+ headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
59
  });
60
+
61
+ const html = response.data;
62
+ const $ = cheerio.load(html);
63
+ const textContent = $('body').text().toLowerCase();
64
+
65
+ // Check keywords if configured
66
+ if (this.config.keywords.length > 0) {
67
+ const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
68
+ if (!hasKeyword) {
69
+ console.log(`⏭️ Skipping ${url} - no matching keywords`);
70
+ continue;
71
+ }
72
+ }
73
+
74
+ results.push({ url, html });
75
+
76
+ // Find matching links if we haven't reached max depth
77
+ if (depth < this.config.maxDepth && this.config.mode !== 'single') {
78
+ const baseUrl = new URL(url);
79
+ const links = $('a[href]').map((_, el) => $(el).attr('href')).get();
80
+
81
+ for (const link of links) {
82
+ try {
83
+ const absoluteUrl = new URL(link, url).toString();
84
+
85
+ // Domain constraint
86
+ if (new URL(absoluteUrl).hostname !== baseUrl.hostname) continue;
87
+
88
+ // Mode constraint: Directory
89
+ if (this.config.mode === 'directory') {
90
+ const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
91
+ const linkPath = new URL(absoluteUrl).pathname;
92
+ if (!linkPath.startsWith(basePath)) continue;
93
+ }
94
+
95
+ if (!this.visitedUrls.has(absoluteUrl)) {
96
+ queue.push({ url: absoluteUrl, depth: depth + 1 });
97
+ }
98
+ } catch (e) {
99
+ // Invalid URL, ignore
100
+ }
101
+ }
102
+ }
103
+
104
  } catch (error: any) {
105
  console.error(`Crawl failed for ${url}:`, error.message);
106
  }
 
129
  metadata: {
130
  url: item.url,
131
  description: description,
132
+ crawledAt: new Date().toISOString(),
133
+ keywordsMatched: this.config.keywords
134
  },
135
  timestamp: new Date()
136
  };