Spaces:
Paused
Paused
Deploy from GitHub Actions 2025-12-15_13-51-53
Browse files
apps/backend/package.json
CHANGED
|
@@ -6,12 +6,13 @@
|
|
| 6 |
"scripts": {
|
| 7 |
"dev": "tsx watch src/index.ts",
|
| 8 |
"build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
|
| 9 |
-
"build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --format=esm",
|
| 10 |
"build:tsc": "tsc",
|
| 11 |
"start": "node dist/index.js",
|
| 12 |
"test": "vitest run",
|
| 13 |
"neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
|
| 14 |
-
"neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js"
|
|
|
|
| 15 |
},
|
| 16 |
"dependencies": {
|
| 17 |
"@anthropic-ai/sdk": "^0.71.0",
|
|
@@ -86,4 +87,4 @@
|
|
| 86 |
"tsx": "^4.20.6",
|
| 87 |
"typescript": "~5.8.2"
|
| 88 |
}
|
| 89 |
-
}
|
|
|
|
| 6 |
"scripts": {
|
| 7 |
"dev": "tsx watch src/index.ts",
|
| 8 |
"build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
|
| 9 |
+
"build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --external:dotenv --format=esm",
|
| 10 |
"build:tsc": "tsc",
|
| 11 |
"start": "node dist/index.js",
|
| 12 |
"test": "vitest run",
|
| 13 |
"neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
|
| 14 |
+
"neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js",
|
| 15 |
+
"ingest-drive": "tsx src/scripts/ingest-drive.ts"
|
| 16 |
},
|
| 17 |
"dependencies": {
|
| 18 |
"@anthropic-ai/sdk": "^0.71.0",
|
|
|
|
| 87 |
"tsx": "^4.20.6",
|
| 88 |
"typescript": "~5.8.2"
|
| 89 |
}
|
| 90 |
+
}
|
apps/backend/src/mcp/ingestionHandlers.ts
CHANGED
|
@@ -72,32 +72,39 @@ export async function ingestionStartHandler(params: any): Promise<any> {
|
|
| 72 |
|
| 73 |
export async function ingestionCrawlHandler(params: any): Promise<any> {
|
| 74 |
await initializeAdapters();
|
| 75 |
-
const url = params
|
| 76 |
-
|
| 77 |
if (!url) {
|
| 78 |
return { success: false, error: 'URL is required' };
|
| 79 |
}
|
| 80 |
|
| 81 |
try {
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return {
|
| 96 |
success: true,
|
| 97 |
-
message: `Crawled
|
| 98 |
count
|
| 99 |
};
|
| 100 |
} catch (error: any) {
|
|
|
|
| 101 |
return { success: false, error: error.message };
|
| 102 |
}
|
| 103 |
}
|
|
@@ -106,7 +113,7 @@ export async function ingestionHarvestHandler(params: any): Promise<any> {
|
|
| 106 |
// Trigger the autonomous harvester
|
| 107 |
// We don't await the full process to prevent timeout, but we start it.
|
| 108 |
autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
|
| 109 |
-
|
| 110 |
return {
|
| 111 |
success: true,
|
| 112 |
message: 'Autonomous Harvest Sequence Initiated',
|
|
@@ -119,7 +126,7 @@ export async function ingestionStatusHandler(params: any): Promise<any> {
|
|
| 119 |
// Get real stats from Vector Store
|
| 120 |
const vectorStore = getNeo4jVectorStore();
|
| 121 |
const stats = await vectorStore.getStatistics();
|
| 122 |
-
|
| 123 |
const engineStatus = dataIngestionEngine.getStatus();
|
| 124 |
|
| 125 |
return {
|
|
|
|
| 72 |
|
| 73 |
/**
 * MCP handler that crawls a URL through the registered 'Web Crawler' adapter.
 *
 * Reads `url` (required) and the optional `mode`, `depth` and `keywords`
 * fields from `params`, configures the adapter with them, queues the URL and
 * runs ingestion for that adapter only.
 *
 * @param params Request payload carrying `url` and optionally `mode`,
 *   `depth` and `keywords`.
 * @returns `{ success: true, message, count }` when the crawl ran, otherwise
 *   `{ success: false, error }`. Errors thrown while configuring or ingesting
 *   are logged and reported through the `error` field rather than rethrown.
 */
export async function ingestionCrawlHandler(params: any): Promise<any> {
  await initializeAdapters();

  // Tolerate a missing payload so the caller gets the 'URL is required'
  // response instead of a destructuring TypeError.
  const { url, mode, depth, keywords } = params ?? {};

  if (!url) {
    return { success: false, error: 'URL is required' };
  }

  // Resolve the defaults once so the crawler configuration and the reported
  // message can never disagree (previously the message printed the raw,
  // possibly undefined, depth).
  const effectiveMode = mode || 'single';
  const effectiveDepth = depth ?? 1;
  const effectiveKeywords = Array.isArray(keywords) ? keywords : [];

  try {
    const crawler = dataIngestionEngine.getAdapter('Web Crawler') as WebCrawlerAdapter;

    if (!crawler) {
      return { success: false, error: 'Web Crawler adapter not initialized' };
    }

    // Configure crawler with request parameters
    crawler.configure({
      mode: effectiveMode,
      maxDepth: effectiveDepth,
      keywords: effectiveKeywords
    });

    // Add URL to crawler queue
    crawler.addUrls([url]);

    // Trigger specific ingestion for the crawler
    const count = await dataIngestionEngine.ingestFrom('Web Crawler');

    return {
      success: true,
      message: `Crawled ${url} (Mode: ${effectiveMode}, Depth: ${effectiveDepth}, Keywords: ${effectiveKeywords.length})`,
      count
    };
  } catch (error: any) {
    console.error('Crawl handler error:', error);
    // A thrown value is not guaranteed to be an Error instance.
    return { success: false, error: error?.message ?? String(error) };
  }
}
|
|
|
|
| 113 |
// Trigger the autonomous harvester
|
| 114 |
// We don't await the full process to prevent timeout, but we start it.
|
| 115 |
autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
|
| 116 |
+
|
| 117 |
return {
|
| 118 |
success: true,
|
| 119 |
message: 'Autonomous Harvest Sequence Initiated',
|
|
|
|
| 126 |
// Get real stats from Vector Store
|
| 127 |
const vectorStore = getNeo4jVectorStore();
|
| 128 |
const stats = await vectorStore.getStatistics();
|
| 129 |
+
|
| 130 |
const engineStatus = dataIngestionEngine.getStatus();
|
| 131 |
|
| 132 |
return {
|
apps/backend/src/scripts/ingest-drive.ts
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import * as dotenv from 'dotenv';
|
| 2 |
+
import * as path from 'path';
|
| 3 |
+
import { ingestRepository } from '../services/GraphIngestor.js';
|
| 4 |
+
import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
|
| 5 |
+
|
| 6 |
+
dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });
|
| 7 |
+
|
| 8 |
+
/**
 * Windows locations that must never be ingested wholesale.
 *
 * Each pattern is tested against the target path, case-insensitively and with
 * either slash style. Directory patterns end in a "separator or end of string"
 * boundary so that siblings which merely share a prefix (for example
 * `C:\WindowsBackup`) are not rejected by mistake.
 */
const FORBIDDEN_PATHS = [
  /^c:[\/\\]?$/i, // C:, C:\, C:/
  /^c:[\/\\]windows(?:[\/\\]|$)/i,
  /^c:[\/\\]program files(?: \(x86\))?(?:[\/\\]|$)/i, // Program Files and Program Files (x86)
  /^c:[\/\\]users[\/\\][^\\\/]+[\/\\]appdata(?:[\/\\]|$)/i // AppData
];
|
| 15 |
+
|
| 16 |
+
/**
 * CLI entry point: ingests the directory given as the first command-line
 * argument via `ingestRepository`, after refusing any location matched by
 * `FORBIDDEN_PATHS`.
 *
 * The process always terminates from inside this function. Exit code is 0
 * when ingestion reports success and 1 for bad usage, a denied path, an
 * ingestion result with errors, or an unexpected exception.
 */
async function main() {
  const args = process.argv.slice(2);
  const targetPath = args[0];

  if (!targetPath) {
    console.error('❌ Usage: npm run ingest-drive -- <path>');
    console.error('Example: npm run ingest-drive -- "C:\\Users\\claus\\Documents"');
    process.exit(1);
  }

  // Safety checks
  const normalizedPath = path.normalize(targetPath);
  // Also test the absolute form: the deny list is anchored on the drive
  // letter, so a relative argument such as "..\..\Windows" would otherwise
  // slip through.
  const resolvedPath = path.resolve(normalizedPath);

  const isForbidden = FORBIDDEN_PATHS.some(
    forbidden => forbidden.test(normalizedPath) || forbidden.test(resolvedPath)
  );
  if (isForbidden) {
    console.error(`❌ DENIED: Ingesting '${normalizedPath}' is restricted for safety reasons.`);
    console.error('Please point to a specific subdirectory (e.g., Documents, Projects).');
    process.exit(1);
  }

  console.log(`🚀 Starting ingestion of: ${normalizedPath}`);
  console.log('NOTE: This may take a while depending on drive size.');

  // Tracks the outcome so the shell / CI sees a non-zero status on failure.
  let exitCode = 0;

  try {
    const result = await ingestRepository({
      rootPath: normalizedPath,
      maxDepth: 10,
      parseContent: true, // Read file content
      generateEmbeddings: true, // Generate vectors
      includePatterns: ['*'],
      excludePatterns: [
        'node_modules', '.git', 'dist', 'build', 'coverage',
        '$Recycle.Bin', 'System Volume Information', 'pagefile.sys', 'swapfile.sys',
        '.vscode', '.idea'
      ]
    });

    if (result.success) {
      console.log('✅ Ingestion completed successfully!');
      console.log('Stats:', result.stats);
    } else {
      console.error('❌ Ingestion finished with errors:', result.errors);
      exitCode = 1;
    }
  } catch (error) {
    console.error('💥 Fatal error:', error);
    exitCode = 1;
  } finally {
    // A failing close must not prevent the process from exiting, and must not
    // surface as an unhandled rejection.
    try {
      await neo4jAdapter.close();
    } catch (closeError) {
      console.error('⚠️ Failed to close Neo4j connection:', closeError);
      exitCode = 1;
    }
    process.exit(exitCode);
  }
}
|
| 68 |
+
|
| 69 |
+
main();
|
apps/backend/src/services/ingestion/DataIngestionEngine.ts
CHANGED
|
@@ -40,6 +40,12 @@ export class DataIngestionEngine {
|
|
| 40 |
console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
|
| 41 |
}
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
/** Start ingestion from all registered adapters */
|
| 44 |
async ingestAll(): Promise<void> {
|
| 45 |
if (this.isRunning) {
|
|
|
|
| 40 |
console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
|
| 41 |
}
|
| 42 |
|
| 43 |
+
/**
 * Look up an adapter in this engine's `adapters` collection by name.
 *
 * @param name Key to look up, e.g. 'Web Crawler'.
 * @returns The matching adapter, or `undefined` when none is stored under
 *   that name.
 */
|
| 44 |
+
getAdapter(name: string): DataSourceAdapter | undefined {
|
| 45 |
+
return this.adapters.get(name);
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
|
| 49 |
/** Start ingestion from all registered adapters */
|
| 50 |
async ingestAll(): Promise<void> {
|
| 51 |
if (this.isRunning) {
|
apps/backend/src/services/ingestion/WebCrawlerAdapter.ts
CHANGED
|
@@ -1,6 +1,13 @@
|
|
| 1 |
import axios from 'axios';
|
| 2 |
import * as cheerio from 'cheerio';
|
| 3 |
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
export class WebCrawlerAdapter implements DataSourceAdapter {
|
| 6 |
name = 'Web Crawler';
|
|
@@ -8,27 +15,92 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
|
|
| 8 |
|
| 9 |
private visitedUrls: Set<string> = new Set();
|
| 10 |
private urlsToFetch: string[] = [];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
constructor() {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
/** Add URLs to fetch queue */
|
| 15 |
addUrls(urls: string[]): void {
|
| 16 |
-
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
async fetch(): Promise<any[]> {
|
| 20 |
const results: any[] = [];
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
if (this.visitedUrls.has(url)) continue;
|
| 23 |
this.visitedUrls.add(url);
|
| 24 |
|
| 25 |
-
console.log(`🕷️ Crawling: ${url}`);
|
|
|
|
| 26 |
try {
|
| 27 |
const response = await axios.get(url, {
|
| 28 |
timeout: 10000,
|
| 29 |
-
headers: { 'User-Agent': 'WidgeTDC-
|
| 30 |
});
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
} catch (error: any) {
|
| 33 |
console.error(`Crawl failed for ${url}:`, error.message);
|
| 34 |
}
|
|
@@ -57,7 +129,8 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
|
|
| 57 |
metadata: {
|
| 58 |
url: item.url,
|
| 59 |
description: description,
|
| 60 |
-
crawledAt: new Date().toISOString()
|
|
|
|
| 61 |
},
|
| 62 |
timestamp: new Date()
|
| 63 |
};
|
|
|
|
| 1 |
import axios from 'axios';
|
| 2 |
import * as cheerio from 'cheerio';
|
| 3 |
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
|
| 4 |
+
import { URL } from 'url';
|
| 5 |
+
|
| 6 |
+
/**
 * Settings that control how WebCrawlerAdapter.fetch() walks outward from the
 * queued URLs.
 */
interface CrawlerConfig {
|
| 7 |
+
/**
 * Link-following strategy:
 * - 'single': only the queued URLs are fetched; no links are followed.
 * - 'directory': same-host links are followed only when their path starts
 *   with the directory portion of the page they were found on.
 * - 'recursive': every same-host link is followed.
 */
mode: 'single' | 'directory' | 'recursive';
|
| 8 |
+
/**
 * Links found on a page are queued only while that page's depth is below
 * this value; queued URLs start at depth 0.
 */
maxDepth: number;
|
| 9 |
+
/**
 * When non-empty, a page is skipped unless its body text contains at least
 * one of these keywords (compared case-insensitively).
 */
keywords: string[];
|
| 10 |
+
}
|
| 11 |
|
| 12 |
export class WebCrawlerAdapter implements DataSourceAdapter {
|
| 13 |
name = 'Web Crawler';
|
|
|
|
| 15 |
|
| 16 |
private visitedUrls: Set<string> = new Set();
|
| 17 |
private urlsToFetch: string[] = [];
|
| 18 |
+
private config: CrawlerConfig = {
|
| 19 |
+
mode: 'single',
|
| 20 |
+
maxDepth: 1,
|
| 21 |
+
keywords: []
|
| 22 |
+
};
|
| 23 |
|
| 24 |
+
constructor() { }
|
| 25 |
+
|
| 26 |
+
/**
 * Update the crawler settings. Only fields that are present and valid are
 * applied; anything else is reported with a warning and the current setting
 * is left untouched.
 *
 * @param config Partial settings. `mode` must be 'single', 'directory' or
 *   'recursive'; `maxDepth` must convert to a finite number >= 0 (fractions
 *   are floored); `keywords` must be an array (non-string entries are dropped).
 */
configure(config: Partial<CrawlerConfig>) {
  const { mode, maxDepth, keywords } = config;

  if (mode) {
    // fetch() follows links for every mode other than 'single', so an
    // unrecognized value would silently behave like an unrestricted crawl.
    if (mode === 'single' || mode === 'directory' || mode === 'recursive') {
      this.config.mode = mode;
    } else {
      console.warn(`WebCrawlerAdapter: ignoring unknown mode '${String(mode)}'`);
    }
  }

  if (maxDepth !== undefined) {
    // Values may arrive untyped from request payloads, so coerce before checking.
    const depthLimit = Number(maxDepth);
    if (Number.isFinite(depthLimit) && depthLimit >= 0) {
      this.config.maxDepth = Math.floor(depthLimit);
    } else {
      console.warn(`WebCrawlerAdapter: ignoring invalid maxDepth '${String(maxDepth)}'`);
    }
  }

  if (keywords) {
    if (Array.isArray(keywords)) {
      // Copy so later mutation of the caller's array cannot change the crawl,
      // and keep only strings because fetch() calls toLowerCase() on each entry.
      this.config.keywords = keywords.filter(kw => typeof kw === 'string');
    } else {
      console.warn('WebCrawlerAdapter: ignoring keywords because it is not an array');
    }
  }
}
|
| 31 |
|
| 32 |
/**
 * Add URLs to the fetch queue.
 *
 * Only absolute http(s) URLs are queued. Entries that are not strings, do not
 * parse as URLs, or use another scheme are dropped, so a bad entry cannot
 * abort the whole batch.
 *
 * @param urls Candidate URLs to queue.
 */
addUrls(urls: string[]): void {
  const validUrls = urls.filter(candidate => {
    if (typeof candidate !== 'string') return false;
    try {
      // Parsing normalizes the scheme's case and rejects look-alikes such as
      // "httpfoo" that a plain prefix test would accept.
      const { protocol } = new URL(candidate);
      return protocol === 'http:' || protocol === 'https:';
    } catch {
      return false; // not a parseable absolute URL
    }
  });
  this.urlsToFetch.push(...validUrls);
}
|
| 37 |
|
| 38 |
async fetch(): Promise<any[]> {
|
| 39 |
const results: any[] = [];
|
| 40 |
+
const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
|
| 41 |
+
|
| 42 |
+
// Clear initial queue as we are processing it now
|
| 43 |
+
this.urlsToFetch = [];
|
| 44 |
+
|
| 45 |
+
while (queue.length > 0) {
|
| 46 |
+
const item = queue.shift();
|
| 47 |
+
if (!item) break;
|
| 48 |
+
const { url, depth } = item;
|
| 49 |
+
|
| 50 |
if (this.visitedUrls.has(url)) continue;
|
| 51 |
this.visitedUrls.add(url);
|
| 52 |
|
| 53 |
+
console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);
|
| 54 |
+
|
| 55 |
try {
|
| 56 |
const response = await axios.get(url, {
|
| 57 |
timeout: 10000,
|
| 58 |
+
headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
|
| 59 |
});
|
| 60 |
+
|
| 61 |
+
const html = response.data;
|
| 62 |
+
const $ = cheerio.load(html);
|
| 63 |
+
const textContent = $('body').text().toLowerCase();
|
| 64 |
+
|
| 65 |
+
// Check keywords if configured
|
| 66 |
+
if (this.config.keywords.length > 0) {
|
| 67 |
+
const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
|
| 68 |
+
if (!hasKeyword) {
|
| 69 |
+
console.log(`⏭️ Skipping ${url} - no matching keywords`);
|
| 70 |
+
continue;
|
| 71 |
+
}
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
results.push({ url, html });
|
| 75 |
+
|
| 76 |
+
// Find matching links if we haven't reached max depth
|
| 77 |
+
if (depth < this.config.maxDepth && this.config.mode !== 'single') {
|
| 78 |
+
const baseUrl = new URL(url);
|
| 79 |
+
const links = $('a[href]').map((_, el) => $(el).attr('href')).get();
|
| 80 |
+
|
| 81 |
+
for (const link of links) {
|
| 82 |
+
try {
|
| 83 |
+
const absoluteUrl = new URL(link, url).toString();
|
| 84 |
+
|
| 85 |
+
// Domain constraint
|
| 86 |
+
if (new URL(absoluteUrl).hostname !== baseUrl.hostname) continue;
|
| 87 |
+
|
| 88 |
+
// Mode constraint: Directory
|
| 89 |
+
if (this.config.mode === 'directory') {
|
| 90 |
+
const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
|
| 91 |
+
const linkPath = new URL(absoluteUrl).pathname;
|
| 92 |
+
if (!linkPath.startsWith(basePath)) continue;
|
| 93 |
+
}
|
| 94 |
+
|
| 95 |
+
if (!this.visitedUrls.has(absoluteUrl)) {
|
| 96 |
+
queue.push({ url: absoluteUrl, depth: depth + 1 });
|
| 97 |
+
}
|
| 98 |
+
} catch (e) {
|
| 99 |
+
// Invalid URL, ignore
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
}
|
| 103 |
+
|
| 104 |
} catch (error: any) {
|
| 105 |
console.error(`Crawl failed for ${url}:`, error.message);
|
| 106 |
}
|
|
|
|
| 129 |
metadata: {
|
| 130 |
url: item.url,
|
| 131 |
description: description,
|
| 132 |
+
crawledAt: new Date().toISOString(),
|
| 133 |
+
keywordsMatched: this.config.keywords
|
| 134 |
},
|
| 135 |
timestamp: new Date()
|
| 136 |
};
|