Kraft102 committed on
Commit
a84b07b
·
verified ·
1 Parent(s): 66f3b51

Deploy from GitHub Actions 2025-12-15_13-51-53

Browse files
apps/backend/package.json CHANGED
@@ -6,12 +6,13 @@
6
  "scripts": {
7
  "dev": "tsx watch src/index.ts",
8
  "build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
9
- "build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --format=esm",
10
  "build:tsc": "tsc",
11
  "start": "node dist/index.js",
12
  "test": "vitest run",
13
  "neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
14
- "neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js"
 
15
  },
16
  "dependencies": {
17
  "@anthropic-ai/sdk": "^0.71.0",
@@ -86,4 +87,4 @@
86
  "tsx": "^4.20.6",
87
  "typescript": "~5.8.2"
88
  }
89
- }
 
6
  "scripts": {
7
  "dev": "tsx watch src/index.ts",
8
  "build": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --format=esm",
9
+ "build-fixed": "esbuild src/index.ts --bundle --platform=node --target=node20 --outfile=dist/index.js --external:@prisma/client --external:better-sqlite3 --external:pg-native --external:@xenova/transformers --external:onnxruntime-node --external:sharp --external:canvas --external:fs --external:path --external:os --external:dotenv --format=esm",
10
  "build:tsc": "tsc",
11
  "start": "node dist/index.js",
12
  "test": "vitest run",
13
  "neural-bridge": "tsx src/mcp/servers/NeuralBridgeServer.ts",
14
+ "neural-bridge:build": "tsc && node dist/mcp/servers/NeuralBridgeServer.js",
15
+ "ingest-drive": "tsx src/scripts/ingest-drive.ts"
16
  },
17
  "dependencies": {
18
  "@anthropic-ai/sdk": "^0.71.0",
 
87
  "tsx": "^4.20.6",
88
  "typescript": "~5.8.2"
89
  }
90
+ }
apps/backend/src/mcp/ingestionHandlers.ts CHANGED
@@ -72,32 +72,39 @@ export async function ingestionStartHandler(params: any): Promise<any> {
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
- const url = params.url;
76
-
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
- // Force the Web Crawler adapter to handle this URL
83
- // In a real system, the engine might route based on pattern, but here we invoke directly or rely on the engine identifying it's a URL
84
- // Since we registered WebCrawler, ingestionEngine.ingestFrom(url) should work if the adapter's fetch handles it.
85
- // However, standard ingestFrom typically takes an adapter ID or similar if it's scanning.
86
- // But WebCrawlerAdapter.fetch takes a source string.
87
- // We need to ensure dataIngestionEngine routes "http..." strings to WebCrawlerAdapter.
88
-
89
- // For now, we can manually invoke the adapter if we can access it, OR simpler:
90
- // We trust ingestFrom to iterate adapters or we specifically use the crawler instance if we exposed it.
91
- // Let's assume ingestFrom(url) works if we modified DataIngestionEngine to handle specific inputs,
92
- // OR we just perform the crawl here directly using a new instance if needed, but better to go through engine.
93
-
94
- const count = await dataIngestionEngine.ingestFrom(url);
 
 
 
 
 
 
95
  return {
96
  success: true,
97
- message: `Crawled and ingested ${url}`,
98
  count
99
  };
100
  } catch (error: any) {
 
101
  return { success: false, error: error.message };
102
  }
103
  }
@@ -106,7 +113,7 @@ export async function ingestionHarvestHandler(params: any): Promise<any> {
106
  // Trigger the autonomous harvester
107
  // We don't await the full process to prevent timeout, but we start it.
108
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
109
-
110
  return {
111
  success: true,
112
  message: 'Autonomous Harvest Sequence Initiated',
@@ -119,7 +126,7 @@ export async function ingestionStatusHandler(params: any): Promise<any> {
119
  // Get real stats from Vector Store
120
  const vectorStore = getNeo4jVectorStore();
121
  const stats = await vectorStore.getStatistics();
122
-
123
  const engineStatus = dataIngestionEngine.getStatus();
124
 
125
  return {
 
72
 
73
  export async function ingestionCrawlHandler(params: any): Promise<any> {
74
  await initializeAdapters();
75
+ const { url, mode, depth, keywords } = params;
76
+
77
  if (!url) {
78
  return { success: false, error: 'URL is required' };
79
  }
80
 
81
  try {
82
+ const crawler = dataIngestionEngine.getAdapter('Web Crawler') as WebCrawlerAdapter;
83
+
84
+ if (!crawler) {
85
+ return { success: false, error: 'Web Crawler adapter not initialized' };
86
+ }
87
+
88
+ // Configure crawler with request parameters
89
+ crawler.configure({
90
+ mode: mode || 'single',
91
+ maxDepth: depth !== undefined ? depth : 1,
92
+ keywords: keywords || []
93
+ });
94
+
95
+ // Add URL to crawler queue
96
+ crawler.addUrls([url]);
97
+
98
+ // Trigger specific ingestion for the crawler
99
+ const count = await dataIngestionEngine.ingestFrom('Web Crawler');
100
+
101
  return {
102
  success: true,
103
+ message: `Crawled ${url} (Mode: ${mode || 'single'}, Depth: ${depth}, Keywords: ${keywords?.length || 0})`,
104
  count
105
  };
106
  } catch (error: any) {
107
+ console.error('Crawl handler error:', error);
108
  return { success: false, error: error.message };
109
  }
110
  }
 
113
  // Trigger the autonomous harvester
114
  // We don't await the full process to prevent timeout, but we start it.
115
  autonomousHarvester.startHarvest().catch(err => console.error("Harvest background error:", err));
116
+
117
  return {
118
  success: true,
119
  message: 'Autonomous Harvest Sequence Initiated',
 
126
  // Get real stats from Vector Store
127
  const vectorStore = getNeo4jVectorStore();
128
  const stats = await vectorStore.getStatistics();
129
+
130
  const engineStatus = dataIngestionEngine.getStatus();
131
 
132
  return {
apps/backend/src/scripts/ingest-drive.ts ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import * as dotenv from 'dotenv';
2
+ import * as path from 'path';
3
+ import { ingestRepository } from '../services/GraphIngestor.js';
4
+ import { neo4jAdapter } from '../adapters/Neo4jAdapter.js';
5
+
6
+ dotenv.config({ path: path.resolve(process.cwd(), 'apps/backend/.env') });
7
+
8
+ const FORBIDDEN_PATHS = [
9
+ /^c:[\/\\]?$/i, // C:, C:\, C:/
10
+ /^c:[\/\\]windows/i,
11
+ /^c:[\/\\]program files/i,
12
+ /^c:[\/\\]program files \(x86\)/i,
13
+ /^c:[\/\\]users[\/\\][^\\\/]+[\/\\]appdata/i // AppData
14
+ ];
15
+
16
+ async function main() {
17
+ const args = process.argv.slice(2);
18
+ const targetPath = args[0];
19
+
20
+ if (!targetPath) {
21
+ console.error('❌ Usage: npm run ingest-drive -- <path>');
22
+ console.error('Example: npm run ingest-drive -- "C:\\Users\\claus\\Documents"');
23
+ process.exit(1);
24
+ }
25
+
26
+ // Safety checks
27
+ const normalizedPath = path.normalize(targetPath);
28
+
29
+ for (const forbidden of FORBIDDEN_PATHS) {
30
+ if (forbidden.test(normalizedPath)) {
31
+ console.error(`❌ DENIED: Ingesting '${normalizedPath}' is restricted for safety reasons.`);
32
+ console.error('Please point to a specific subdirectory (e.g., Documents, Projects).');
33
+ process.exit(1);
34
+ }
35
+ }
36
+
37
+ console.log(`🚀 Starting ingestion of: ${normalizedPath}`);
38
+ console.log('NOTE: This may take a while depending on drive size.');
39
+
40
+ try {
41
+ const result = await ingestRepository({
42
+ rootPath: normalizedPath,
43
+ maxDepth: 10,
44
+ parseContent: true, // Read file content
45
+ generateEmbeddings: true, // Generate vectors
46
+ includePatterns: ['*'],
47
+ excludePatterns: [
48
+ 'node_modules', '.git', 'dist', 'build', 'coverage',
49
+ '$Recycle.Bin', 'System Volume Information', 'pagefile.sys', 'swapfile.sys',
50
+ '.vscode', '.idea'
51
+ ]
52
+ });
53
+
54
+ if (result.success) {
55
+ console.log('✅ Ingestion completed successfully!');
56
+ console.log('Stats:', result.stats);
57
+ } else {
58
+ console.error('❌ Ingestion finished with errors:', result.errors);
59
+ }
60
+
61
+ } catch (error) {
62
+ console.error('💥 Fatal error:', error);
63
+ } finally {
64
+ await neo4jAdapter.close();
65
+ process.exit(0);
66
+ }
67
+ }
68
+
69
+ main();
apps/backend/src/services/ingestion/DataIngestionEngine.ts CHANGED
@@ -40,6 +40,12 @@ export class DataIngestionEngine {
40
  console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
41
  }
42
 
 
 
 
 
 
 
43
  /** Start ingestion from all registered adapters */
44
  async ingestAll(): Promise<void> {
45
  if (this.isRunning) {
 
40
  console.log(`📥 Registered data adapter: ${adapter.name} (${adapter.type}) - ${canUse ? 'Ready' : 'Awaiting approval'}`);
41
  }
42
 
43
+ /** Get a registered adapter */
44
+ getAdapter(name: string): DataSourceAdapter | undefined {
45
+ return this.adapters.get(name);
46
+ }
47
+
48
+
49
  /** Start ingestion from all registered adapters */
50
  async ingestAll(): Promise<void> {
51
  if (this.isRunning) {
apps/backend/src/services/ingestion/WebCrawlerAdapter.ts CHANGED
@@ -1,6 +1,13 @@
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
 
 
 
 
 
 
 
4
 
5
  export class WebCrawlerAdapter implements DataSourceAdapter {
6
  name = 'Web Crawler';
@@ -8,27 +15,92 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
8
 
9
  private visitedUrls: Set<string> = new Set();
10
  private urlsToFetch: string[] = [];
 
 
 
 
 
11
 
12
- constructor() {}
 
 
 
 
 
 
13
 
14
  /** Add URLs to fetch queue */
15
  addUrls(urls: string[]): void {
16
- this.urlsToFetch = urls.filter(u => u.startsWith('http'));
 
17
  }
18
 
19
  async fetch(): Promise<any[]> {
20
  const results: any[] = [];
21
- for (const url of this.urlsToFetch) {
 
 
 
 
 
 
 
 
 
22
  if (this.visitedUrls.has(url)) continue;
23
  this.visitedUrls.add(url);
24
 
25
- console.log(`🕷️ Crawling: ${url}`);
 
26
  try {
27
  const response = await axios.get(url, {
28
  timeout: 10000,
29
- headers: { 'User-Agent': 'WidgeTDC-Crawler/1.0' }
30
  });
31
- results.push({ url, html: response.data });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  } catch (error: any) {
33
  console.error(`Crawl failed for ${url}:`, error.message);
34
  }
@@ -57,7 +129,8 @@ export class WebCrawlerAdapter implements DataSourceAdapter {
57
  metadata: {
58
  url: item.url,
59
  description: description,
60
- crawledAt: new Date().toISOString()
 
61
  },
62
  timestamp: new Date()
63
  };
 
1
  import axios from 'axios';
2
  import * as cheerio from 'cheerio';
3
  import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
4
+ import { URL } from 'url';
5
+
6
+ interface CrawlerConfig {
7
+ mode: 'single' | 'directory' | 'recursive';
8
+ maxDepth: number;
9
+ keywords: string[];
10
+ }
11
 
12
  export class WebCrawlerAdapter implements DataSourceAdapter {
13
  name = 'Web Crawler';
 
15
 
16
  private visitedUrls: Set<string> = new Set();
17
  private urlsToFetch: string[] = [];
18
+ private config: CrawlerConfig = {
19
+ mode: 'single',
20
+ maxDepth: 1,
21
+ keywords: []
22
+ };
23
 
24
+ constructor() { }
25
+
26
+ configure(config: Partial<CrawlerConfig>) {
27
+ if (config.mode) this.config.mode = config.mode;
28
+ if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
29
+ if (config.keywords) this.config.keywords = config.keywords;
30
+ }
31
 
32
  /** Add URLs to fetch queue */
33
  addUrls(urls: string[]): void {
34
+ const validUrls = urls.filter(u => u.startsWith('http'));
35
+ this.urlsToFetch.push(...validUrls);
36
  }
37
 
38
  async fetch(): Promise<any[]> {
39
  const results: any[] = [];
40
+ const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
41
+
42
+ // Clear initial queue as we are processing it now
43
+ this.urlsToFetch = [];
44
+
45
+ while (queue.length > 0) {
46
+ const item = queue.shift();
47
+ if (!item) break;
48
+ const { url, depth } = item;
49
+
50
  if (this.visitedUrls.has(url)) continue;
51
  this.visitedUrls.add(url);
52
 
53
+ console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);
54
+
55
  try {
56
  const response = await axios.get(url, {
57
  timeout: 10000,
58
+ headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
59
  });
60
+
61
+ const html = response.data;
62
+ const $ = cheerio.load(html);
63
+ const textContent = $('body').text().toLowerCase();
64
+
65
+ // Check keywords if configured
66
+ if (this.config.keywords.length > 0) {
67
+ const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
68
+ if (!hasKeyword) {
69
+ console.log(`⏭️ Skipping ${url} - no matching keywords`);
70
+ continue;
71
+ }
72
+ }
73
+
74
+ results.push({ url, html });
75
+
76
+ // Find matching links if we haven't reached max depth
77
+ if (depth < this.config.maxDepth && this.config.mode !== 'single') {
78
+ const baseUrl = new URL(url);
79
+ const links = $('a[href]').map((_, el) => $(el).attr('href')).get();
80
+
81
+ for (const link of links) {
82
+ try {
83
+ const absoluteUrl = new URL(link, url).toString();
84
+
85
+ // Domain constraint
86
+ if (new URL(absoluteUrl).hostname !== baseUrl.hostname) continue;
87
+
88
+ // Mode constraint: Directory
89
+ if (this.config.mode === 'directory') {
90
+ const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
91
+ const linkPath = new URL(absoluteUrl).pathname;
92
+ if (!linkPath.startsWith(basePath)) continue;
93
+ }
94
+
95
+ if (!this.visitedUrls.has(absoluteUrl)) {
96
+ queue.push({ url: absoluteUrl, depth: depth + 1 });
97
+ }
98
+ } catch (e) {
99
+ // Invalid URL, ignore
100
+ }
101
+ }
102
+ }
103
+
104
  } catch (error: any) {
105
  console.error(`Crawl failed for ${url}:`, error.message);
106
  }
 
129
  metadata: {
130
  url: item.url,
131
  description: description,
132
+ crawledAt: new Date().toISOString(),
133
+ keywordsMatched: this.config.keywords
134
  },
135
  timestamp: new Date()
136
  };